# First ML Project (Testing Scikit-Learn)

The goal of this machine learning model is to predict whether an individual has more or fewer friends, based on the data from Kaggle's personality dataset, which tracks various features characteristic of introverts and extroverts.

In [1]:
# Importing packages
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
%matplotlib inline

In [2]:
# Reading in the csv file.
# The location defaults to the original local path, but it can be overridden
# with the PERSONALITY_CSV environment variable so the notebook can run on
# another machine without editing this cell.
# (sep=',' is pandas' default delimiter, so it is not passed explicitly.)
DATA_PATH = os.environ.get(
    'PERSONALITY_CSV',
    '/Users/alexmohamed/Desktop/personality_datasert.csv',
)
personalities = pd.read_csv(DATA_PATH)

In [3]:
# Previewing the first five rows to confirm the file loaded as expected
personalities.head(n=5)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert


In [4]:
# Counting missing values per column (all zeros means nothing needs imputing)
personalities.isna().sum()

Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64

In [5]:
# Coercing the friend counts to numbers; entries that cannot be parsed become NaN
personalities['Friends_circle_size'] = (
    personalities['Friends_circle_size'].pipe(pd.to_numeric, errors='coerce')
)

In [6]:
# Discarding rows whose friend count is missing
personalities = personalities.dropna(
    axis='index',
    how='any',
    subset=['Friends_circle_size'],
)

In [7]:
# Binning the friend counts into the two target classes:
#   [0, 4)   -> 'less_friends'
#   [4, inf) -> 'more_friends'
# right=False closes each bin on the left, so exactly 4 friends counts as
# 'more_friends'. The last edge is open-ended (instead of 16.0) so that a count
# of 16 or more is still binned rather than silently turning into NaN.
# NOTE: this overwrites the numeric column in place; re-run the notebook from
# the loading cell before re-running this one.
bins = (0.0, 4.0, float('inf'))
labels = ['less_friends', 'more_friends']
personalities['Friends_circle_size'] = pd.cut(
    personalities['Friends_circle_size'],
    bins=bins,
    labels=labels,
    right=False
)
# Anything still unbinned here (e.g. a negative count) would otherwise flow
# into the label encoding as a missing value, so fail loudly instead.
assert personalities['Friends_circle_size'].notna().all(), \
    'Friends_circle_size contains values outside the bins'
personalities['Friends_circle_size'].unique()

['more_friends', 'less_friends']
Categories (2, object): ['less_friends' < 'more_friends']

In [8]:
# Encoder used in the next cell to turn the 'less_friends' / 'more_friends' labels into integers
label_friends = LabelEncoder()

In [9]:
# Replacing the bin labels with the encoder's integer codes
friends_codes = label_friends.fit_transform(personalities['Friends_circle_size'])
personalities['Friends_circle_size'] = friends_codes

In [10]:
# Inspecting the first 20 rows now that the target column holds integer codes
personalities.head(n=20)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,1,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,1,2.0,Introvert
3,0.0,No,6.0,7.0,No,1,8.0,Extrovert
4,3.0,No,9.0,4.0,No,1,5.0,Extrovert
5,1.0,No,7.0,5.0,No,1,6.0,Extrovert
6,4.0,No,9.0,3.0,No,1,7.0,Extrovert
7,2.0,No,8.0,4.0,No,1,8.0,Extrovert
8,10.0,Yes,1.0,3.0,Yes,0,3.0,Introvert
9,0.0,No,8.0,6.0,No,1,8.0,Extrovert


In [11]:
# Class balance of the encoded target, most frequent class first
personalities['Friends_circle_size'].value_counts(sort=True, ascending=False)

Friends_circle_size
1    1956
0     944
Name: count, dtype: int64

In [12]:
# Encoding the string-valued columns as 0/1.
# Each column is trimmed and lower-cased before the lookup so that stray
# whitespace or capitalisation does not prevent a match.
binary_encodings = {
    'Stage_fear': {'yes': 1, 'no': 0},
    'Drained_after_socializing': {'yes': 1, 'no': 0},
    'Personality': {'extrovert': 1, 'introvert': 0},
}
for column, encoding in binary_encodings.items():
    normalized = personalities[column].str.strip().str.lower()
    personalities[column] = normalized.map(encoding)

In [13]:
# Confirming that every column is numeric after the encoding step
personalities.head(n=5)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,1,5.0,1
1,9.0,1,0.0,0.0,1,0,3.0,0
2,9.0,1,1.0,2.0,1,1,2.0,0
3,0.0,0,6.0,7.0,0,1,8.0,1
4,3.0,0,9.0,4.0,0,1,5.0,1


In [14]:
# Splitting the frame into the prediction target (y) and the feature matrix (X)
y = personalities['Friends_circle_size']
X = personalities.drop(columns='Friends_circle_size')

In [15]:
# Holding out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Standardising the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Using the Random Forest Classifier

In [17]:
# Fitting a 200-tree random forest on the scaled training data and predicting
# the held-out set. random_state is fixed (same seed as the train/test split)
# so the fitted forest and the metrics reported for it are reproducible.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [18]:
# Peeking at the first 20 random-forest predictions
pred_rfc[0:20]

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0])

In [19]:
# Evaluating the random forest: per-class report, then the confusion matrix
rfc_report = classification_report(y_test, pred_rfc)
rfc_matrix = confusion_matrix(y_test, pred_rfc)
print(rfc_report)
print(rfc_matrix)

              precision    recall  f1-score   support

           0       0.68      0.79      0.74       189
           1       0.89      0.82      0.86       391

    accuracy                           0.81       580
   macro avg       0.79      0.81      0.80       580
weighted avg       0.82      0.81      0.82       580

[[150  39]
 [ 69 322]]


# Using the SVM Classifier

In [20]:
# Support vector classifier constructed with no arguments (library defaults),
# fitted on the scaled training data and applied to the held-out set
clf = SVC().fit(X_train, y_train)
pred_clf = clf.predict(X_test)

In [21]:
# Evaluating the SVM: per-class report, then the confusion matrix
print(
    classification_report(y_test, pred_clf),
    confusion_matrix(y_test, pred_clf),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.66      0.99      0.80       189
           1       1.00      0.76      0.86       391

    accuracy                           0.83       580
   macro avg       0.83      0.88      0.83       580
weighted avg       0.89      0.83      0.84       580

[[188   1]
 [ 95 296]]


# Using the Neural Network

In [22]:
# Fitting a multi-layer perceptron with three hidden layers of 11 units each,
# allowing up to 500 training iterations, then predicting the held-out set.
# random_state is fixed (same seed as the train/test split) so the fitted
# network and the metrics reported for it are reproducible.
mlpc = MLPClassifier(hidden_layer_sizes=(11, 11, 11), max_iter=500, random_state=42)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)

In [23]:
# Evaluating the neural network: per-class report, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, pred_mlpc))

              precision    recall  f1-score   support

           0       0.66      0.96      0.78       189
           1       0.98      0.76      0.85       391

    accuracy                           0.83       580
   macro avg       0.82      0.86      0.82       580
weighted avg       0.87      0.83      0.83       580

[[182   7]
 [ 94 297]]


# Using Metrics to Check Model Accuracy

In [24]:
# Accuracy of the random forest on the test set
# (accuracy_score is imported with the other packages at the top of the notebook).
rfc_acc = accuracy_score(y_test, pred_rfc)
rfc_acc

0.8137931034482758

In [25]:
# Accuracy of the SVM on the test set
svm_acc = accuracy_score(y_true=y_test, y_pred=pred_clf)
svm_acc

0.8344827586206897

In [26]:
# Accuracy of the neural network on the test set
mlpc_acc = accuracy_score(y_true=y_test, y_pred=pred_mlpc)
mlpc_acc

0.8258620689655173