In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('TrainOnMe.csv')

df = df.drop(columns=['Unnamed: 0', 'x12'])


In [2]:
df.describe(include="all")

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x13
count,5000,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000,5000.0,5000.0,5000.0,5000.0,5000.0
unique,3,,,,,,,5,,,,,
top,Tesla,,,,,,,EBIT/Wh,,,,,
freq,2073,,,,,,,1706,,,,,
mean,,199.999115,0.014573,-99.948342,-1.053022,229.938275,-121.014496,,0.013802,-0.005572,11.069748,-0.377698,950.021404
std,,1.014776,0.70757,3.169226,0.00661,1.014779,1.795108,,1.209435,1.238413,2.744753,3.356351,4.986109
min,,196.5198,-2.54698,-112.55433,-1.07099,226.47226,-127.40319,,-4.34935,-5.14099,0.67229,-11.23297,932.91008
25%,,199.30946,-0.471633,-102.090328,-1.05728,229.2462,-122.23214,,-0.707843,-0.740918,9.242318,-2.620452,946.660668
50%,,199.990365,0.01809,-100.00584,-1.05328,229.930135,-120.99636,,-0.00935,-0.004245,11.05434,-0.59176,950.001265
75%,,200.703677,0.50302,-97.88407,-1.04816,230.642983,-119.81581,,0.751287,0.754335,12.81948,1.79637,953.383755


In [3]:
df.x7.unique()

array(['AI', 'EBIT/Wh', 'Q2', 'Q3', 'Q1'], dtype=object)

In [4]:
df.y.unique()

array(['Tesla', 'SpaceX', 'TwitterX'], dtype=object)

In [5]:
from sklearn.utils import shuffle

In [6]:
RANDOM_STATE = 6

X = df.drop(columns=['y'])
Y = df.y
X_train, y_train = shuffle(X, Y, random_state=RANDOM_STATE)

In [8]:
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['float64']).columns

Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x13'], dtype='object')


In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

#We encode the categorical features

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder())
])

from sklearn.decomposition import PCA

#We use a PCA on the numerical features 

numerical_transformer = Pipeline([
    ('pca', PCA(n_components=11)),
])

from sklearn.compose import ColumnTransformer

#We proceed to the columns transformation

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, \
GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

classifiers = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "K-neighbours": KNeighborsClassifier(),
    "Decision tree": DecisionTreeClassifier(),
    "SVM (rbf)": SVC(),
    "SVM (linear)": SVC(kernel="linear"),
    "SVM (polynomial)": SVC(kernel="poly"),
    "Random forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Adaboost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE),
    "MLP": MLPClassifier(max_iter=5000, hidden_layer_sizes=(13,20,20,10,3)),
    "Ridge Classifier": RidgeClassifierCV()
}

best_classifier_name = ""
best_classifier = None
best_score = 0
cross_validation = StratifiedKFold(shuffle=True, random_state=RANDOM_STATE, n_splits=10)

for classifier_name, classifier in classifiers.items():
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])
    classifier_score = np.average(cross_val_score(pipeline, X_train, y_train, cv=cross_validation))

    print(f"{classifier_name}'s score : {classifier_score}")
    
    if classifier_score> best_score:
        best_score = classifier_score
        best_classifier = pipeline
        best_classifier_name = classifier_name

print(f"\n Best classifier : {best_classifier_name}")

Gradient Boosting's score : 0.8392
K-neighbours's score : 0.5198
Decision tree's score : 0.7615999999999999
SVM (rbf)'s score : 0.5982000000000001
SVM (linear)'s score : 0.5626
SVM (polynomial)'s score : 0.5277999999999999
Random forest's score : 0.8393999999999998




Adaboost's score : 0.735
Bagging's score : 0.8173999999999999
MLP's score : 0.6516
Ridge Classifier's score : 0.6003999999999999

 Best classifier : Random forest


In [14]:
from sklearn.model_selection import RandomizedSearchCV

pipeline = Pipeline([('preprocessor', preprocessor), ('forest', RandomForestClassifier(random_state=RANDOM_STATE))])

params = { 
 'forest__bootstrap': [True, False],
 'forest__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'forest__max_features': ['log2', 'sqrt'],
 'forest__min_samples_leaf': [1, 2, 4],
 'forest__min_samples_split': [2, 5, 10],
 'forest__n_estimators': [100, 200, 400, 600, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2400]
}

#Since we can look for all the parameters, because it would be too long, we use RandomizedSearchCV in order to randomly select 120 parameters combinaisons

forest_search = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=120, verbose=1, n_jobs=-1, cv=cross_validation)
forest_search.fit(X_train, y_train)
forest_search.best_score_

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


np.float64(0.8423999999999999)

In [15]:
pipeline = Pipeline([('preprocessor', preprocessor),('gb', GradientBoostingClassifier(random_state=RANDOM_STATE))])
params = {
    "gb__n_estimators" : [50, 100, 200, 300],
    "gb__max_depth" : [5, 10, 20, None],
    "gb__learning_rate" : [0.1, 0.2, 0.01, 0.05],
}

gb_grid = GridSearchCV(pipeline, param_grid=params, verbose=1, n_jobs=-1, cv=cross_validation)
gb_grid.fit(X_train, y_train)
gb_grid.best_score_

NameError: name 'GridSearchCV' is not defined