In [1]:
from google.colab import files
# Opens Colab's upload widget; select the Kaggle API token file (kaggle.json).
files.upload()  # Upload kaggle.json

# Copy the uploaded token into ~/.kaggle and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the playground-series-s5e7 competition archive with the Kaggle CLI.
!kaggle competitions download -c playground-series-s5e7

Saving kaggle.json to kaggle.json
Downloading playground-series-s5e7.zip to /content
  0% 0.00/191k [00:00<?, ?B/s]
100% 191k/191k [00:00<00:00, 461MB/s]


In [2]:
!unzip /content/playground-series-s5e7.zip -d /content/playground-series-s5e7

Archive:  /content/playground-series-s5e7.zip
  inflating: /content/playground-series-s5e7/sample_submission.csv  
  inflating: /content/playground-series-s5e7/test.csv  
  inflating: /content/playground-series-s5e7/train.csv  


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [9]:
# Load test.csv into `test_df` and preview its first rows.
test_df = pd.read_csv('/content/playground-series-s5e7/test.csv')
test_df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


In [10]:
# Load sample_submission.csv into `sub_df` and preview its first rows.
sub_df = pd.read_csv('/content/playground-series-s5e7/sample_submission.csv')
sub_df.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Extrovert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Extrovert


In [39]:
# https://www.kaggle.com/code/metric/accuracy-score
import numpy as np
import pandas as pd
import pandas.api.types
from sklearn.metrics import accuracy_score
import sklearn.metrics

from typing import Sequence, Union, Optional


class ParticipantVisibleError(Exception):
    """Exception type that `score` raises for invalid scoring inputs.

    Used for non-numeric sample weights, an unexpected number of submission
    columns, and to re-wrap any error raised by the underlying metric call.
    """


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, normalize: bool=True, weights_column_name: Optional[str]=None) -> float:
    '''
    Wrapper for https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
    Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    The input frames are not modified: the id and weight columns are removed
    from internal copies only, so the same frames can be scored repeatedly.

    Parameters
    ----------
    solution : 1d DataFrame. Ground truth (correct) labels.

    submission : 1d DataFrame. Predicted labels, as returned by a classifier.

    row_id_column_name : str
        Name of the row id column present in both frames. It is dropped before
        scoring; rows are compared positionally, not by id.

    normalize : bool, default=True
        If False, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    weights_column_name: optional str, the name of the sample weights column in the solution file.

    Returns
    -------
    float
        The accuracy (or the count of correct samples when ``normalize=False``).

    Raises
    ------
    KeyError
        If ``row_id_column_name`` is missing from either frame.
    ValueError
        If ``weights_column_name`` is given but not a column of ``solution``.
    ParticipantVisibleError
        If the weights are not numeric, the submission has an invalid number
        of columns, or the underlying metric call fails.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = [0, 2, 1, 3]
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred["id"] = range(len(y_pred))
    >>> y_true = [0, 1, 2, 3]
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true["id"] = range(len(y_true))
    >>> score(y_true, y_pred, row_id_column_name)
    0.5
    >>> score(y_true, y_pred, row_id_column_name, normalize=False)
    2.0
    '''
    # Skip sorting and equality checks for the row_id_column since that should already be handled.
    # `drop` returns new frames, so the caller's DataFrames keep their id column
    # (a missing column still raises KeyError, as `del` would).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    sample_weight = None
    if weights_column_name:
        if weights_column_name not in solution.columns:
            raise ValueError(f'The solution weights column {weights_column_name} is not found')
        # `solution` is already a local copy here, so popping does not touch the caller's frame.
        sample_weight = solution.pop(weights_column_name).values
        if not pandas.api.types.is_numeric_dtype(sample_weight):
            raise ParticipantVisibleError('The solution weights are not numeric')

    # Either a single prediction column, or one column per remaining solution column.
    if not((len(submission.columns) == 1) or (len(submission.columns) == len(solution.columns))):
        raise ParticipantVisibleError(f'Invalid number of submission columns. Found {len(submission.columns)}')

    solution_values = solution.values
    submission_values = submission.values

    try:
        score_result = accuracy_score(solution_values, submission_values, normalize=normalize, sample_weight=sample_weight)
    except Exception as e:
        # Re-wrap any metric failure in the participant-facing error type, keeping the cause chained.
        raise ParticipantVisibleError(str(e)) from e

    return float(score_result)

In [11]:
# Load train.csv into `df` and preview its first 10 rows.
df = pd.read_csv('/content/playground-series-s5e7/train.csv')
df.head(10)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,5,2.0,No,8.0,5.0,No,,3.0,Extrovert
6,6,1.0,No,8.0,,No,,4.0,Extrovert
7,7,2.0,No,8.0,3.0,No,4.0,5.0,Extrovert
8,8,4.0,Yes,2.0,1.0,,0.0,2.0,Introvert
9,9,1.0,No,8.0,6.0,No,14.0,9.0,Extrovert


In [5]:
# Dimensions of the training frame as (rows, columns).
df.shape

(18524, 9)

In [12]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         6175 non-null   int64  
 1   Time_spent_Alone           5750 non-null   float64
 2   Stage_fear                 5577 non-null   object 
 3   Social_event_attendance    5778 non-null   float64
 4   Going_outside              5709 non-null   float64
 5   Drained_after_socializing  5743 non-null   object 
 6   Friends_circle_size        5825 non-null   float64
 7   Post_frequency             5767 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 386.1+ KB


In [6]:
# Column dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [7]:
# Summary statistics for the numeric columns of the training frame.
df.describe()

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


In [8]:
# Print the distinct values of every object-dtype column in the training frame.
object_columns = df.select_dtypes('object').columns
for col in object_columns:
    distinct_values = df[col].unique()
    print(f'{col}: {distinct_values}')

Stage_fear: ['No' 'Yes' nan]
Drained_after_socializing: ['No' nan 'Yes']
Personality: ['Extrovert' 'Introvert']


## First: Train a basic model on raw data to get initial insights

In [13]:
# Preview the raw training frame before any encoding.
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [14]:
# Work on a copy so the raw training frame `df` is left unchanged by this assignment.
df1 = df.copy()

In [16]:
# Binary-encode the two Yes/No columns: 'Yes' -> 1, everything else -> 0.
# The values are derived from the raw frame `df` rather than from `df1` itself, so
# re-running this cell gives the same result instead of zeroing the columns
# (an already-encoded 1 is not equal to 'Yes').
# NOTE(review): missing values are encoded as 0, i.e. indistinguishable from 'No'.
df1['Stage_fear'] = (df['Stage_fear'] == 'Yes').astype(int)
df1['Drained_after_socializing'] = (df['Drained_after_socializing'] == 'Yes').astype(int)

In [17]:
# Preview the frame after the Yes/No columns were encoded.
df1.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,0,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,0,10.0,8.0,Extrovert
2,2,6.0,1,1.0,0.0,0,3.0,0.0,Introvert
3,3,3.0,0,7.0,3.0,0,11.0,5.0,Extrovert
4,4,1.0,0,4.0,4.0,0,13.0,,Extrovert


In [18]:
# Column dtypes and non-null counts after encoding.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 18524 non-null  int64  
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  18524 non-null  int64  
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(3), object(1)
memory usage: 1.3+ MB


In [22]:
# Split the encoded frame into features X (every column except the target) and target y.
# NOTE(review): X still contains the `id` column, so the model is trained on the row
# identifier — consider dropping it here and, consistently, from the frame passed to
# predict later.
X = df1.drop('Personality', axis=1)
y = df1['Personality']

In [28]:
# Encode the target: 'Extrovert' -> 1, anything else -> 0.
# Built from df1['Personality'] rather than from `y` itself so the cell is idempotent;
# re-applying the rule to an already-encoded `y` would turn every label into 0.
y = (df1['Personality'] == 'Extrovert').astype(int)

In [29]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Baseline: XGBoost classifier with library-default hyperparameters, fit on the training split.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
# Evaluate on the held-out split; the value is displayed as the cell output.
xgb_model.score(X_test, y_test)

0.9678812415654521

In [45]:
# Predicted 0/1 labels for the held-out split (1 = 'Extrovert' under the target encoding above).
y_pred = xgb_model.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [46]:
# Build the two frames expected by `score`: each has a 'row_id' column plus one
# 'Personality' column, with the held-out ground truth in `solution` and the
# model's predictions for the same rows in `submission`.
solution = pd.DataFrame({
    'row_id': X_test.index,
    'Personality': y_test         # ground truth
})

submission = pd.DataFrame({
    'row_id': X_test.index,
    'Personality': y_pred         # model predictions
})

In [47]:
# Pass copies so that scoring cannot alter `solution` / `submission`; this keeps the
# cell safe to re-run (the frames still have their 'row_id' column afterwards).
print(score(solution.copy(), submission.copy(), row_id_column_name="row_id"))

0.9678812415654521


In [48]:
# Preview the test frame before encoding its Yes/No columns.
test_df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


In [49]:
# Binary-encode the Yes/No columns of the test frame: 'Yes' -> 1, everything else -> 0
# (the same rule that is applied to the training frame).
# The dtype guard makes the cell idempotent: once a column is numeric it is left alone,
# whereas re-applying the `== 'Yes'` rule would turn every value into 0.
# NOTE(review): missing values are encoded as 0, i.e. indistinguishable from 'No'.
for _col in ('Stage_fear', 'Drained_after_socializing'):
    if not pd.api.types.is_numeric_dtype(test_df[_col]):
        test_df[_col] = (test_df[_col] == 'Yes').astype(int)

In [58]:
# Predict a 0/1 label for every test row, then pair each prediction with its id.
test_predictions = xgb_model.predict(test_df)
submission = pd.DataFrame({'id': test_df['id'], 'Personality': test_predictions})

In [59]:
# Preview the submission frame while 'Personality' still holds the 0/1 predictions.
submission.head()

Unnamed: 0,id,Personality
0,18524,1
1,18525,0
2,18526,1
3,18527,1
4,18528,0


In [60]:
  # Translate the 0/1 predictions back to the string labels: 1 -> 'Extrovert', 0 -> 'Introvert'.
  # `replace` leaves values that are already labels unchanged, so the cell is safe to
  # re-run; a plain `x == 1` test would relabel every row 'Introvert' on a second run.
  submission['Personality'] = submission['Personality'].replace({1: 'Extrovert', 0: 'Introvert'})
  submission.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert


In [62]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)