# YanCheng Vehicle Sales Prediction

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit,GridSearchCV,StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression,SGDRegressor
from sklearn.preprocessing import OneHotEncoder
train=pd.read_csv('../data/train.csv')
test=pd.read_csv('../data/test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
grouped=train["sale_quantity"].groupby([train["class_id"],train["sale_date"]])
grouped.mean()

class_id  sale_date
103507    201503        19.333333
          201504        38.666667
          201505        37.666667
          201506        40.857143
          201507        49.500000
          201508        88.750000
          201509        63.142857
          201510       175.000000
          201511        80.166667
          201512       286.600000
          201601       110.333333
          201602       119.125000
          201603        92.857143
          201604        72.500000
          201605       193.000000
          201606       207.250000
          201607        94.500000
          201608       135.333333
          201609       228.000000
          201610       210.666667
          201611       292.000000
          201612       826.500000
          201701       249.500000
          201702       217.500000
          201703       123.666667
          201704       145.000000
          201705       214.500000
          201706       185.500000
          201707       156.5

In [18]:
grouped1=train["sale_quantity"].groupby(train["class_id"])
grouped1.mean()

class_id
103507    132.441860
124140     85.778626
125403     48.156863
136916     63.141618
169673    116.200000
175962     83.241935
178529    138.271812
186250    128.666667
194201    308.500000
194450     45.166667
198427     53.584416
206765    402.322222
209945     71.039474
219195     70.466102
221795     76.800000
245609     33.275862
248352    137.930085
249875     36.400000
250658     93.566667
265980    123.037267
270690    295.809091
281301    156.993902
281792    103.373913
289386     82.175439
289403     64.416667
290854     50.726316
291086    213.088710
291514     37.857143
302513     49.666667
304458     76.432277
             ...    
745137    118.427938
750340     78.078431
760412     14.960000
786351    167.917476
789290     18.235294
810398     44.913043
815230    107.500000
819061    180.825581
842246    154.400000
851857     89.500000
854079    103.429577
854548     40.052632
861459    111.466667
871642     57.311377
872180     90.500000
883691     99.671642
8901

#数据预处理
columnNames=["brand_id","class_id"]
for colums in columnNames:
    train_dummies=pd.get_dummies(train[colums],prefix=colums)
    train=pd.concat([train,train_dummies],axis=1)
    {
            "name": "RandomForestRegressor",
            "estimator": SGDRegressor(random_state=0),
            "hyperparameters":
            {
                "penalty": ["l2","l1","elasticnet"],
                "alpha": np.logspace(-9, 3, 13),
                "learning_rate": ["constant","optimal","invscaling"]
            }
        },
        {
            "name": "SVR",
            "estimator": SVR(),
            "hyperparameters":
            {
                "kernel": ["rbf"],
                "C": np.logspace(-9, 3, 13),
                "gamma": np.logspace(-9, 3, 13)
            }
        }

In [19]:
t=pd.concat([train["brand_id"],train["class_id"],train["sale_date"]],axis=1)
t.head(5)
b=OneHotEncoder()
brand=b.fit(t)

In [20]:
#模型选择
def get_model(df,features):
    train_x=df[features]
    train_y=df["sale_quantity"]
    cv=ShuffleSplit(n_splits=10,train_size=0.7,test_size=0.3,random_state=0)
    model_param=[
        {
            "name": "AdaBoostRegressor",
            "estimator": AdaBoostRegressor(random_state=0),
            "hyperparameters":
            {
                "n_estimators": [20,50,80,110],
                "learning_rate":  np.logspace(-9, 3, 13)
            }
        },
        {
            "name": "RandomForestRegressor",
            "estimator": SGDRegressor(random_state=0),
            "hyperparameters":
            {
                "penalty": ["l2","l1","elasticnet"],
                "alpha": np.logspace(-9, 3, 13),
                "learning_rate": ["constant","optimal","invscaling"]
            }
        }
    ]
    models=[]
    for model in model_param:
        grid=GridSearchCV(estimator=model["estimator"],param_grid=model["hyperparameters"],cv=10)
        grid.fit(train_x,train_y)
    
        model_att={
            "model": grid.best_estimator_,
            "best_param": grid.best_params_,
            "best_score": grid.best_score_,
            "grid":grid    
        }
        models.append(model_att)
        print("model and its parameters:")
        print(grid.best_params_)
        print(grid.best_score_)
    return models

In [21]:
## feature selection using RFECV
def get_features(df,features,model=None):
    newDf=df.copy()
    newDf = newDf.select_dtypes(['number'])
    newDf = newDf.dropna(axis=1, how='any')
    all_X = newDf[features]
    all_y = df["sale_quantity"]
    cv=StratifiedShuffleSplit(n_splits=10,train_size=.7,test_size=.3,random_state=0)
    if model==None:
        regressor=AdaBoostRegressorn(n_estimators=100)
    else:
        regressor=model
    selector = RFECV(regressor, scoring = 'roc_auc', cv=cv, step = 1)
    selector.fit(all_X,all_y) 
    rfecv_columns = all_X.columns[selector.support_]
    return rfecv_columns

In [27]:
features=["brand_id","class_id"]
models = get_model(train,features)

model and its parameters:
{'learning_rate': 1e-08, 'n_estimators': 110}
-0.324760194901
model and its parameters:
{'alpha': 1000.0, 'learning_rate': 'optimal', 'penalty': 'l2'}
-3.75450799831e+26


In [28]:
#select the best one based on its index from console
best_grid=models[0]["grid"]
best_regressor=models[0]["model"]
best_param=models[0]["best_param"]

In [32]:
predictions=best_regressor.predict(pd.concat([test["predict_date"],test["class_id"]],axis=1))

In [33]:
sub={"predict_date": test["predict_date"],"class_id": test["class_id"],"predict_quantity": predictions}
submission=pd.DataFrame(sub)
submission.to_csv(path_or_buf="Submission.csv", index=False, header=True)