In [1]:
#!pip install pycaret
#!pip install pycaret --upgrade

import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
#from pycaret.classification import *

# import mlflow
# mlflow.set_tracking_uri("file:./mlruns")
# mlflow.set_experiment("testSocialSphere1")


In [2]:
# Load dataset
df = pd.read_csv('../data/ssma.csv')


In [3]:
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [4]:
df.shape

(705, 13)

## Data Preprocessing

In [5]:
# Dropping the Student_ID column
df.drop(['Student_ID'], axis = 1, inplace = True)

#### Encoding - Gender

In [6]:
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [7]:
df['Gender'] = df['Gender'].apply(lambda x: 0 if x== 'Female' else 1)
#df['Gender'].head()

#### Encoding - Affects_Academic_Performance

In [8]:
df['Affects_Academic_Performance'].unique()

array(['Yes', 'No'], dtype=object)

In [9]:
df['Affects_Academic_Performance'] = df['Affects_Academic_Performance'].apply(lambda x: 0 if x== 'No' else 1)

#### Encoding - Academic_Level

In [10]:
df['Academic_Level'].unique()

array(['Undergraduate', 'Graduate', 'High School'], dtype=object)

In [11]:
academic_level_dummies = pd.get_dummies(df['Academic_Level'], drop_first = True, dtype='int')

In [12]:
df = pd.concat([academic_level_dummies, df], axis = 1)
df.drop(['Academic_Level'], axis = 1, inplace = True)

#### Encoding - Relationship Status

In [13]:
df['Relationship_Status'].unique()

array(['In Relationship', 'Single', 'Complicated'], dtype=object)

In [14]:
relationship_status_dummies = pd.get_dummies(df['Relationship_Status'], drop_first = True, dtype='int')

In [15]:
df = pd.concat([relationship_status_dummies, df], axis = 1)
df.drop(['Relationship_Status'], axis = 1, inplace = True)

#### Encoding - Most_Used_Platform

In [16]:
df['Most_Used_Platform'].unique()

array(['Instagram', 'Twitter', 'TikTok', 'YouTube', 'Facebook',
       'LinkedIn', 'Snapchat', 'LINE', 'KakaoTalk', 'VKontakte',
       'WhatsApp', 'WeChat'], dtype=object)

In [17]:
# Using OneHotEncoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# Create the encoder
encoder = OneHotEncoder(sparse_output=False)

# Reshape the column to 2D array as required by the encoder
platform_encoded = encoder.fit_transform(df[['Most_Used_Platform']])

# Create a DataFrame with the encoded values
platform_encoded_df = pd.DataFrame(
    platform_encoded,
    columns=[f'Platform_{platform}' for platform in encoder.categories_[0]]
)

# If you want to add these encoded columns back to your original dataframe
df_encoded = pd.concat([platform_encoded_df, df], axis=1)
df.drop(['Most_Used_Platform'], axis = 1, inplace = True)


#### Encoding - Country

In [18]:
#df['Country'].unique()
df['Country'].nunique() # showing below over 100 countries

110

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Create the encoder
country_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the 'Country' column
country_encoded = country_encoder.fit_transform(df[['Country']])

# Create a DataFrame with the encoded values
country_encoded_df = pd.DataFrame(
    country_encoded,
    columns=[f'Country_{country}' for country in country_encoder.categories_[0]]
)

# Concatenate with the original DataFrame
df_encoded = pd.concat([country_encoded_df, df], axis=1)
df_encoded.drop(['Country'], axis = 1, inplace = True)



In [20]:
df_encoded.shape  #Shape here is 705 rows and 122 columns

(705, 122)

In [21]:
df_encoded.head()

Unnamed: 0,Country_Afghanistan,Country_Albania,Country_Andorra,Country_Argentina,Country_Armenia,Country_Australia,Country_Austria,Country_Azerbaijan,Country_Bahamas,Country_Bahrain,...,High School,Undergraduate,Age,Gender,Avg_Daily_Usage_Hours,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,19,0,5.2,1,6.5,6,3,8
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,22,1,2.1,0,7.5,8,0,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,20,0,6.0,1,5.0,5,4,9
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,18,1,3.0,0,7.0,7,1,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,21,1,4.5,1,6.0,6,2,7


The encoded dataframe 'df_encoded' whole still having 705 observations now has 122 columns as result of the encoding

In [22]:
df_encoded.shape  #Shape here is 705 rows and 122 columns

(705, 122)

## Creating the Training Set and the Test Set

#### Getting the inputs and output

In [23]:
X = df_encoded.iloc[:, :-1].values
y = df_encoded.iloc[:, -1].values

#### Getting the Train & Test Sets

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [25]:
df_encoded.shape  #Shape here is 705 rows and 122 columns

(705, 122)

### Compairing and Looking for right Model

In [26]:
# Encode the target variable
from sklearn.preprocessing import LabelEncoder

# Create and fit the label encoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data with encoded target
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=0)

# Custom Model Comparison
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
     #xgboost.XGBRegressor(max_depth = 2, learning_rate = 0.1, n_estimators = 100)
    #'XGB Regressor': XGBRegressor(n_estimators = 100),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Time': time.time() - start_time
    }
    
    print(f"{name} completed in {results[name]['Time']:.2f} seconds")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Accuracy', ascending=False)

# Display results
print("\nModel Comparison Results:")
print(results_df)

# Display class mapping
print("\nClass Mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"{i}: {class_name}")


Training Logistic Regression...
Logistic Regression completed in 1.08 seconds

Training Random Forest...
Random Forest completed in 0.29 seconds

Training Gradient Boosting...
Gradient Boosting completed in 1.95 seconds

Training SVM...
SVM completed in 0.16 seconds

Training KNN...
KNN completed in 2.04 seconds

Model Comparison Results:
                     Accuracy  Precision  Recall  F1 Score  Time
Random Forest            0.96       0.96    0.96      0.96  0.29
Logistic Regression      0.95       0.95    0.95      0.95  1.08
Gradient Boosting        0.95       0.95    0.95      0.95  1.95
KNN                      0.95       0.96    0.95      0.95  2.04
SVM                      0.62       0.44    0.62      0.51  0.16

Class Mapping:
0: 2
1: 3
2: 4
3: 5
4: 6
5: 7
6: 8
7: 9


### Building and Training the model

In [27]:
#X

In [28]:
#y

Experiment 1: Train Logistic Regression Classifier

In [29]:
from sklearn.metrics import classification_report

params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 8888,
}

log_reg = LogisticRegression(**params)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)


report = classification_report(y_test, y_pred_log_reg)
print(report)

              precision    recall  f1-score   support

           1       0.75      0.75      0.75         4
           2       0.95      1.00      0.98        21
           3       0.95      0.95      0.95        22
           4       1.00      0.90      0.95        10
           5       1.00      0.98      0.99        50
           6       0.86      1.00      0.93        25
           7       1.00      0.67      0.80         9

    accuracy                           0.95       141
   macro avg       0.93      0.89      0.91       141
weighted avg       0.95      0.95      0.95       141



In [30]:
report_dict = classification_report(y_test, y_pred_log_reg, output_dict=True)
report_dict

{'1': {'precision': 0.75, 'recall': 0.75, 'f1-score': 0.75, 'support': 4.0},
 '2': {'precision': 0.9545454545454546,
  'recall': 1.0,
  'f1-score': 0.9767441860465116,
  'support': 21.0},
 '3': {'precision': 0.9545454545454546,
  'recall': 0.9545454545454546,
  'f1-score': 0.9545454545454546,
  'support': 22.0},
 '4': {'precision': 1.0,
  'recall': 0.9,
  'f1-score': 0.9473684210526315,
  'support': 10.0},
 '5': {'precision': 1.0,
  'recall': 0.98,
  'f1-score': 0.98989898989899,
  'support': 50.0},
 '6': {'precision': 0.8620689655172413,
  'recall': 1.0,
  'f1-score': 0.9259259259259259,
  'support': 25.0},
 '7': {'precision': 1.0,
  'recall': 0.6666666666666666,
  'f1-score': 0.8,
  'support': 9.0},
 'accuracy': 0.950354609929078,
 'macro avg': {'precision': 0.9315942678011643,
  'recall': 0.893030303030303,
  'f1-score': 0.9063547110670732,
  'support': 141.0},
 'weighted avg': {'precision': 0.9545899197403233,
  'recall': 0.950354609929078,
  'f1-score': 0.9491376578766006,
  'supp

In [31]:
import mlflow

In [33]:
mlflow.set_experiment('first_trial_experiment')
# mlflow.set_tracking_uri('file:./mlruns') # http://127.0.0.1:5000
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000/') # mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'recall_class_1': report_dict['1']['recall'],
        'recall_class_2': report_dict['2']['recall'],
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    mlflow.sklearn.log_model(log_reg, "log_reg")

   # mlflow.xgboost.log_model(model, "xgboost")




🏃 View run receptive-gnat-185 at: http://127.0.0.1:5000/#/experiments/355431485210658056/runs/15a3bc10c88b4ad2b9382d94fb0d92f8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/355431485210658056


In [34]:
# import xgboost
# model = xgboost.XGBRegressor(max_depth = 2, learning_rate = 0.1, n_estimators = 100)
# model.fit(X_train, y_train)

#### Alternative Approach to Encoding

In [35]:
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# from sklearn.compose import ColumnTransformer

# # Define transformers
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('binary', OrdinalEncoder(), ['Gender', 'Affects_Academic_Performance']),
#         ('ordinal', OrdinalEncoder(categories=[['High School', 'Undergraduate', 'Graduate']]), ['Academic_Level']),
#         ('onehot', OneHotEncoder(), ['Relationship_Status'])
#     ])

# # Apply preprocessing
# X_transformed = preprocessor.fit_transform(X)
