In [187]:
import pandas as pd
import numpy as np
import os 
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [188]:
# Load the labelled training data; every later cell works on this frame.
TRAIN_CSV = 'train.csv'
df = pd.read_csv(TRAIN_CSV)

Convert `Stage_fear` and `Drained_after_socializing` from Yes/No strings to 1/0 so that the models (e.g. logistic regression) can use them as numeric features.

In [189]:
# Encode the two Yes/No columns as 1/0 (missing values stay NaN).
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# numeric column a second time would look up 1.0/0.0 in the Yes/No dictionary,
# find no match, and silently turn every value into NaN.
YES_NO_TO_INT = {'Yes': 1, 'No': 0}
for col in ('Stage_fear', 'Drained_after_socializing'):
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(YES_NO_TO_INT)





Find the columns that contain NaN values, and show how many values are missing in each one and its dtype.

In [190]:
# Count missing values once, then keep only the columns that have any.
missing_counts = df.isna().sum()
nan_cols = df.columns[missing_counts > 0]

# Per-column NaN counts, followed by the dtypes of those same columns.
print(missing_counts.loc[nan_cols])
print(df.dtypes.loc[nan_cols])

Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
dtype: int64
Time_spent_Alone             float64
Stage_fear                   float64
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing    float64
Friends_circle_size          float64
Post_frequency               float64
dtype: object


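Fill missing values with the column means. This is a reasonable default for the numeric count/score columns. Note that `Stage_fear` and `Drained_after_socializing` are binary (0/1), so mean-filling gives them fractional values (about 0.24 and 0.23) rather than a valid class; the mode would be a more natural fill for those two.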

In [191]:
# Compute every column mean first and keep them in saved_means, because the
# same fill values are reused later when test.csv is prepared.
saved_means = {col: df[col].mean() for col in nan_cols}

# Replace the NaNs of each affected column with that column's mean.
for col, col_mean in saved_means.items():
    df[col] = df[col].fillna(col_mean)

print(saved_means)

{'Time_spent_Alone': np.float64(3.1377639321564557), 'Stage_fear': np.float64(0.241837532319163), 'Social_event_attendance': np.float64(5.265106088560886), 'Going_outside': np.float64(4.044319380935631), 'Drained_after_socializing': np.float64(0.23378417266187052), 'Friends_circle_size': np.float64(7.996737263880939), 'Post_frequency': np.float64(4.982097334878332)}


In [192]:
# Peek at the first five rows after encoding and imputation.
df.head(n=5)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,Extrovert
1,1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,Extrovert
2,2,6.0,1.0,1.0,0.0,0.233784,3.0,0.0,Introvert
3,3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,Extrovert
4,4,1.0,0.0,4.0,4.0,0.0,13.0,4.982097,Extrovert


In [193]:

# Target and features: 'id' is only an identifier, 'Personality' is the label.
y = df['Personality']
X = df.drop(columns=['id', 'Personality'])

# Encode the string labels as integers; le is kept to decode predictions later.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

print(le.classes_)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=43,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

['Extrovert' 'Introvert']


In [194]:
# Share of each personality class, as a percentage of all rows.
class_share_pct = y.value_counts(normalize=True).mul(100)
print(class_share_pct)

Personality
Extrovert    73.95271
Introvert    26.04729
Name: proportion, dtype: float64


So the data is unbalanced (about 74% Extrovert vs. 26% Introvert), and the models need to account for that.
Plain undersampling is not a good option, as we would be throwing away a lot of data.
Instead we pass `class_weight='balanced'`, which weights each class inversely proportional to its frequency in the training data, so mistakes on the rarer class count for more during training.


In [195]:
# Logistic regression with class weights to compensate for the class imbalance,
# trained on the scaled training split.
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X=X_train_scaled, y=y_train)

In [198]:
# Predicted (integer-encoded) labels for the held-out split.
y_lr_pred = lr_model.predict(X=X_test_scaled)

In [199]:
# Per-class precision / recall / F1 for logistic regression on the held-out split.
lr_report = classification_report(
    y_true=y_test,
    y_pred=y_lr_pred,
    target_names=le.classes_,
)
print(lr_report)

              precision    recall  f1-score   support

   Extrovert       0.98      0.97      0.98      2737
   Introvert       0.93      0.93      0.93       968

    accuracy                           0.96      3705
   macro avg       0.95      0.95      0.95      3705
weighted avg       0.96      0.96      0.96      3705



Now we try a decision tree, tuning its depth and minimum leaf size with a grid search.

In [201]:
# Hyper-parameter grid for the decision tree:
#   max_depth        -> 2, 4, ..., 20
#   min_samples_leaf -> 5, 10, ..., 25
param_grid = dict(
    max_depth=range(2, 21, 2),
    min_samples_leaf=range(5, 30, 5),
)


In [209]:
# 5-fold grid search over param_grid for a class-weighted decision tree.
#
# The scorer is a classification metric. The previous 'neg_mean_squared_error'
# is a regression scorer; on 0/1 labels it equals (accuracy - 1), so 'accuracy'
# ranks the candidates the same way while reporting a meaningful best_score_.
# TODO(review): since the classes are imbalanced, consider 'f1_macro' or
# 'balanced_accuracy' here; that may select different hyper-parameters.
print('Running grid search for Decision Tree...')
dt_model = DecisionTreeClassifier(random_state=43, class_weight='balanced')
dt_grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train_scaled, y_train)
print('Best parameters for Decision Tree:', dt_grid_search.best_params_)

Running grid search for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 6, 'min_samples_leaf': 5}


In [210]:
# Build the final tree from the grid-search result rather than hand-copied
# numbers, so this cell cannot drift out of sync with the search above when
# the grid, the scorer, or the data changes.
dt_model = DecisionTreeClassifier(
    **dt_grid_search.best_params_,
    class_weight='balanced',
    random_state=43,
)
dt_model.fit(X_train_scaled, y_train)

In [211]:
# Predicted (integer-encoded) labels from the decision tree for the held-out split.
y_dt_pred = dt_model.predict(X=X_test_scaled)

In [212]:
# Per-class precision / recall / F1 for the decision tree on the held-out split.
dt_report = classification_report(
    y_true=y_test,
    y_pred=y_dt_pred,
    target_names=le.classes_,
)
print(dt_report)

              precision    recall  f1-score   support

   Extrovert       0.98      0.97      0.97      2737
   Introvert       0.92      0.93      0.93       968

    accuracy                           0.96      3705
   macro avg       0.95      0.95      0.95      3705
weighted avg       0.96      0.96      0.96      3705



In conclusion, the decision tree and the logistic regression model give very similar results. This suggests that the two classes are close to linearly separable, since a linear model does as well as a non-linear one.
Both models perform well even though the data is unbalanced (roughly 74% / 26%), because we balance the class weights with `class_weight='balanced'`.

We know this because the precision, recall and F1 score for both classes are high on the held-out test split.


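Since logistic regression is faster, we will use it for our final submission.

Thus we use the logistic regression model (`lr_model`) for the final submission: it performs on par with the decision tree and takes less compute than tree-based models such as the decision tree or a random forest, both when training and during prediction.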

In [215]:
# Load the unlabelled competition data and encode its Yes/No columns as 1/0,
# exactly as was done for the training frame.
df_test = pd.read_csv('test.csv')
for binary_col in ('Stage_fear', 'Drained_after_socializing'):
    df_test[binary_col] = df_test[binary_col].map({'Yes': 1, 'No': 0})

# Keep the ids for the submission file; they are not a model feature.
ids = df_test['id']
X_test_final = df_test.drop(columns=['id'])

# Fill missing values in the test set using the saved means from the training
# set; columns without a saved mean are left untouched.
X_test_final = X_test_final.fillna(value=saved_means)

# Scale with the scaler that was fitted on the training split.
X_test_final_scaled = scaler.transform(X_test_final)

# Predict with logistic regression and decode the integer labels back to names.
predicted_codes = lr_model.predict(X_test_final_scaled)
y_test_final = le.inverse_transform(predicted_codes.astype(int))

submission = pd.DataFrame({'id': ids, 'Personality': y_test_final})
submission.to_csv('submission.csv', index=False)
