In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the data
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')  
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')

In [3]:
train.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [4]:
test.isna().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

## Numerical data + Handle missing Numerical data

In [5]:
# Handle missing data for numerical columns
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_cols:
    train[col].fillna(train[col].mean(), inplace=True) 

for col in numerical_cols:
    test[col].fillna(test[col].mean(), inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mean(), inplace=True)


In [6]:
test[numerical_cols].isna().sum()

id                         0
Time_spent_Alone           0
Social_event_attendance    0
Going_outside              0
Friends_circle_size        0
Post_frequency             0
dtype: int64

In [7]:
train[numerical_cols].isna().sum()

id                         0
Time_spent_Alone           0
Social_event_attendance    0
Going_outside              0
Friends_circle_size        0
Post_frequency             0
dtype: int64

## Categorical data + handle missing categorical data

In [8]:
X = train.drop('Personality', axis=1)  # Features
y = train['Personality']  # Target variable

In [9]:
X.head(3)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,0,0.0,No,6.0,4.0,No,15.0,5.0
1,1,1.0,No,7.0,3.0,No,10.0,8.0
2,2,6.0,Yes,1.0,0.0,,3.0,0.0


In [10]:
# Handle missing data for categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    # Fill with the most frequent value
    X[col].fillna(X[col].mode()[0], inplace=True)
for col in categorical_cols:
    # Fill with the most frequent value
    test[col].fillna(test[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True)


In [11]:
X[categorical_cols].isna().sum()

Stage_fear                   0
Drained_after_socializing    0
dtype: int64

In [12]:
test[categorical_cols].isna().sum()

Stage_fear                   0
Drained_after_socializing    0
dtype: int64

## Convert to numerical 

In [13]:
# One-hot encoding for categorical variables
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)

In [14]:
X.head(3)

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Yes,Drained_after_socializing_Yes
0,0,0.0,6.0,4.0,15.0,5.0,False,False
1,1,1.0,7.0,3.0,10.0,8.0,False,False
2,2,6.0,1.0,0.0,3.0,0.0,True,False


In [15]:
test.head(3)

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Yes,Drained_after_socializing_Yes
0,18524,3.0,7.0,4.0,6.0,5.028958,False,False
1,18525,3.11687,0.0,0.0,5.0,1.0,True,True
2,18526,3.0,5.0,6.0,15.0,9.0,False,False


In [16]:
Y = pd.DataFrame(y)
Y

Unnamed: 0,Personality
0,Extrovert
1,Extrovert
2,Introvert
3,Extrovert
4,Extrovert
...,...
18519,Extrovert
18520,Extrovert
18521,Introvert
18522,Introvert


In [17]:
label_encoder = LabelEncoder()
Y['Personality'] = label_encoder.fit_transform(Y['Personality'])

In [18]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [19]:
model = xgb.XGBClassifier(max_depth=200,learning_rate=0.01,n_estimators=300)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

Accuracy: 0.9667146455559554
Precision: 0.9665615466573811
Recall: 0.9667146455559554
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4115
           1       0.94      0.93      0.94      1443

    accuracy                           0.97      5558
   macro avg       0.96      0.95      0.96      5558
weighted avg       0.97      0.97      0.97      5558



In [20]:
prediction = model.predict(test)

In [21]:
sample.head(3)

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Extrovert
2,18526,Extrovert


In [22]:
original_labels = label_encoder.inverse_transform(prediction)


In [23]:
sample['Personality'] = original_labels
sample.head(3)

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert


In [24]:
sample.to_csv('submission.csv',index=False)