In [44]:
import pandas as pd
import numpy as np
import os 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import root_mean_squared_error, r2_score

In [80]:
df = pd.read_csv('train.csv')

Convert stage fear and drained after socializing to numerical values to work with logistic regression

In [None]:
df['Stage_fear'] = df['Stage_fear'].map({'Yes': 1, 'No': 0})
df['Drained_after_socializing'] = df['Drained_after_socializing'].map({'Yes': 1, 'No': 0})





(18524, 9)


Find nan in columns

In [84]:
nan_cols = df.columns[df.isna().any()]
print(df[nan_cols].isna().sum())

Time_spent_Alone           1190
Social_event_attendance    1180
Going_outside              1466
Friends_circle_size        1054
Post_frequency             1264
dtype: int64


Fill by mean

In [82]:
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,Extrovert
1,1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,Extrovert
2,2,6.0,1.0,1.0,0.0,0.0,3.0,0.0,Introvert
3,3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,Extrovert
4,4,1.0,0.0,4.0,4.0,0.0,13.0,,Extrovert


In [34]:

X = df.drop(columns=['id', 'Personality'])
y = df['Personality']

le = LabelEncoder()
y_enc = le.fit_transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state = 43)


In [35]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [36]:
y_lr_pred = lr_model.predict(X_test)

In [41]:
lr_rmse = root_mean_squared_error( y_test, y_lr_pred)

lr_r2 = r2_score(y_test,y_lr_pred)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

[LR] Root Mean Squared Error: 0.16772638206009213
[LR] R2: 0.805724153815416


Now we try decision tree

In [59]:
param_grid = {'max_depth':range(2,21,2), 'min_samples_leaf':range(25,50,5) }


In [None]:
print('Running grid search for Decision Tree...')
dt_model = DecisionTreeRegressor(random_state=43)
dt_grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='neg_mean_squared_error')
dt_grid_search.fit(X_train, y_train)
print('Best parameters for Decision Tree:', dt_grid_search.best_params_)

Running grid search for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 4, 'min_samples_leaf': 30}


In [63]:
dt_model = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 30)
dt_model.fit(X_train,y_train)

In [64]:
y_dt_pred = dt_model.predict(X_test)

# 2. Compute the RMSE using mean_squared_error()
dt_rmse = root_mean_squared_error(y_test,y_dt_pred)


# 3. Compute the R2 score using r2_score()
dt_r2 = r2_score(y_test,y_dt_pred)


print('[DT] Root Mean Squared Error: {0}'.format(dt_rmse))
print('[DT] R2: {0}'.format(dt_r2))

[DT] Root Mean Squared Error: 0.16823144714081656
[DT] R2: 0.8045523682048865


Do a random forest ensemble

In [66]:
param_grid = {'max_depth':range(2,20,2), 'min_samples_leaf':range(25,50,5) }
dt_model = RandomForestRegressor(random_state=43)
rf_grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
print('Best parameters for Random Forest:', rf_grid_search.best_params_)


Best parameters for Random Forest: {'max_depth': 4, 'min_samples_leaf': 25}


In [67]:
rf_model = RandomForestRegressor(max_depth = 4, min_samples_leaf = 25)
rf_model.fit(X_train,y_train)

In [68]:
y_rf_pred = rf_model.predict(X_test)
rf_rmse = root_mean_squared_error(y_test,y_rf_pred)
rf_r2 = r2_score(y_test,y_rf_pred)
print('[RF] Root Mean Squared Error: {0}'.format(rf_rmse))
print('[RF] R2: {0}'.format(rf_r2))

[RF] Root Mean Squared Error: 0.16791415310087013
[RF] R2: 0.8052889235687845


In conclusion, as we can see, the logistic regression model works as well as the decision tree and Random Forests model.

This suggests that the data is highly linear and using more complex models is not worth the compute time.

In [70]:
df_test = pd.read_csv('test.csv')
df_test['Stage_fear'] = df_test['Stage_fear'].map({'Yes': 1, 'No': 0})
df_test['Drained_after_socializing'] = df_test['Drained_after_socializing'].map({'Yes': 1, 'No': 0})

ids = df_test['id']
X_test_final = df_test.drop(columns=['id'])

y_test_final = lr_model.predict(X_test_final)
y_test_final = le.inverse_transform(y_test_final)
output_df = pd.DataFrame({'id': ids, 'Personality': y_test_final})
output_df.to_csv('submission.csv', index=False)
print("Submission file created successfully.")


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values