In [8]:
import pandas as pd

In [9]:
# Load the prepared dataset that every later cell works from.
df = pd.read_csv(filepath_or_buffer='df_final.csv')

In [14]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [15]:
from sklearn.model_selection import train_test_split

In [24]:
from sklearn.preprocessing import LabelEncoder

# Features: every column except the target.
x = df.drop('Personality', axis=1)

# Target: class names encoded as integers. `le` is kept so that predictions
# can later be decoded back to the original names.
le = LabelEncoder().fit(df['Personality'])
y = le.transform(df['Personality'])

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Inspect the integer-encoded training labels produced by the split.
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear1,Drained_after_socializing1
1799,1.0,7.0,4.0,10.0,5.0,0,0
11931,2.0,4.0,6.0,6.0,8.0,0,0
14307,4.0,5.0,5.0,7.0,6.0,0,0
12157,3.0,6.0,4.044319,8.0,8.0,0,0
18124,2.0,7.0,7.0,15.0,4.0,0,0


In [27]:
# List the distinct values present in this column.
df['Drained_after_socializing1'].unique()

array([0, 1], dtype=int64)

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
import numpy as np

In [35]:
# Column groups, named after the transformer each group is routed to.
function_transformer = ['Time_spent_Alone']
power_transformer = [name for name in x.columns if name not in function_transformer]

# log1p for the first group, Yeo-Johnson power transform for all other columns.
log_step = ('function_transformer', FunctionTransformer(np.log1p), function_transformer)
power_step = ('power_transformer', PowerTransformer(method='yeo-johnson'), power_transformer)

# The two groups together cover every column of `x`, so remainder='passthrough'
# has no leftover columns to forward here.
trf = ColumnTransformer(
    transformers=[log_step, power_step],
    remainder='passthrough',
)


In [36]:
# Fit the transformer on the training split only, then apply the already-fitted
# transformer to the test split, so the test rows never influence the fit.
x_train_trf = trf.fit_transform(x_train)
x_test_trf = trf.transform(x_test)


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a logistic-regression classifier (default settings) on the transformed
# training features.
model = LogisticRegression().fit(x_train_trf, y_train)

# Score the held-out split.
y_pred = model.predict(x_test_trf)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.9686909581646423
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2753
           1       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Step 1: Bundle preprocessing and the classifier into a single pipeline so the
# two are cross-validated together as one estimator.
pipe = Pipeline([
    ('preprocessing', trf),
    ('model', LogisticRegression())
])

# Step 2: Perform 15-fold cross-validation on the full dataset.
# The fold count lives in one named constant so the comment and the call
# cannot drift apart (the previous comment said 5-fold while the code ran 15).
N_FOLDS = 15
scores = cross_val_score(pipe, x, y, cv=N_FOLDS, scoring='accuracy')

# Step 3: Report the per-fold accuracies, their mean and their spread.
print("Cross-validation scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())


Cross-validation scores: [0.97246964 0.96275304 0.97165992 0.96761134 0.96842105 0.9757085
 0.95789474 0.96923077 0.96518219 0.96842105 0.95951417 0.97408907
 0.96842105 0.97651822 0.97568882]
Mean Accuracy: 0.9689055702465239
Standard Deviation: 0.005544015192505647


In [61]:
# A single hand-written sample, using the same feature columns as the training
# data, for a quick manual prediction.
custom_input = pd.DataFrame(
    {
        'Time_spent_Alone': [7.0],
        'Social_event_attendance': [2],
        'Going_outside': [1],
        'Friends_circle_size': [4],
        'Post_frequency': [3],
        'Stage_fear1': [1],
        'Drained_after_socializing1': [1],
    }
)

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Assemble preprocessing + classifier and train on the training split.
pipe = Pipeline(
    steps=[
        ('preprocessing', trf),
        ('model', LogisticRegression()),
    ]
)
pipe.fit(x_train, y_train)

# Predict the (integer-encoded) class for the hand-written sample.
pred = pipe.predict(custom_input)


In [63]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the original (string) target column so that the integer
# prediction can be mapped back to its class name.
le = LabelEncoder().fit(df['Personality'])

predicted_label = le.inverse_transform(pred)

print("Predicted Personality:", predicted_label[0])


Predicted Personality: Introvert


In [64]:
## Test data: predict labels for the external test set

In [None]:
# 1. Load the test data
test_df = pd.read_csv('test_data.csv')

# 2. Predict on test data
# Hand the pipeline only the columns it was fitted on, in training order, so
# that any extra column in the CSV (for example a saved index like the
# 'Unnamed: 0' the training file carried) never reaches the model. If a
# feature column is absent, the selection raises a KeyError that names it.
feature_columns = list(x_train.columns)
y_pred = pipe.predict(test_df[feature_columns])

# 3. Decode the integer predictions back to the original class names
y_pred_labels = le.inverse_transform(y_pred)

# 4. Add predictions to the test DataFrame
test_df['Predicted_Personality'] = y_pred_labels

# 5. Display a preview (printing the whole frame would flood the output) or save
test_df.head()
# test_df.to_csv('test_data_with_predictions.csv', index=False)
