In [36]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


In [37]:
df = pd.read_csv('Students Social Media Addiction.csv')
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student_ID                    705 non-null    int64  
 1   Age                           705 non-null    int64  
 2   Gender                        705 non-null    object 
 3   Academic_Level                705 non-null    object 
 4   Country                       705 non-null    object 
 5   Avg_Daily_Usage_Hours         705 non-null    float64
 6   Most_Used_Platform            705 non-null    object 
 7   Affects_Academic_Performance  705 non-null    object 
 8   Sleep_Hours_Per_Night         705 non-null    float64
 9   Mental_Health_Score           705 non-null    int64  
 10  Relationship_Status           705 non-null    object 
 11  Conflicts_Over_Social_Media   705 non-null    int64  
 12  Addicted_Score                705 non-null    int64  
dtypes: fl

In [39]:
from sklearn.compose import ColumnTransformer
features = df.drop('Addicted_Score', axis=1) # X-features
target = df['Addicted_Score'] # Y-featire

numeric_features = ['Age', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night', 'Mental_Health_Score', 'Conflicts_Over_Social_Media']
categorical_features = ['Gender', 'Academic_Level', 'Most_Used_Platform', 'Relationship_Status', 'Affects_Academic_Performance']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

## Create the final pipeline with the preprocessor and the best model

In [40]:
from sklearn.pipeline import Pipeline
import joblib

# Initializing GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=200, random_state=42)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gb_model)
])

In [41]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Train the pipeline
model_pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = model_pipeline.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nGradient Boosting Results:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Train the full pipeline on the training data
model_pipeline.fit(X_train, y_train)

# Save the entire pipeline to a .joblib file
joblib.dump(model_pipeline, 'addiction_score_pipeline.joblib')


Gradient Boosting Results:
MSE: 0.0514
RMSE: 0.2267
R² Score: 0.9795


['addiction_score_pipeline.joblib']

In [42]:
## Create a function to make predictions on new data
def predict_addiction_score(new_data):
    
   # Predicts the Addicted_Score for new data using the saved pipeline.
   
    # Convert the new data into a DataFrame
    new_data_df = pd.DataFrame(new_data)
    
    # Use the trained pipeline to make predictions
    predictions = model_pipeline.predict(new_data_df)
    
    return predictions


- **Purpose**:
    
    Takes new user-provided data and predicts the **Addicted_Score** using your already trained **model pipeline**.
    
- **Steps**:
    - Converts `new_data` (a dictionary list) into a **Pandas DataFrame** → because ML models expect data in tabular format.
    - Uses `model_pipeline.predict(...)` → the pipeline handles preprocessing (encoding, scaling, etc.) and applies the regression model.
    - Returns the predicted score(s).

In [43]:
def categorize_addiction(score):
    if score <= 3:
        return "Low"
    elif score <= 6:
        return "Moderate"
    else:
        return "High"

# USER INPUT SECTION 

def get_user_data():
    print("Please enter the following information to predict the social media addiction score:")
    
    user_data = {}
    try:
        user_data['Age'] = int(input("Enter Age (e.g., 20): "))
        user_data['Gender'] = input("Enter Gender (e.g., Male, Female): ")
        user_data['Academic_Level'] = input("Enter Academic Level (e.g., Undergraduate, High School, Postgraduate): ")
        user_data['Avg_Daily_Usage_Hours'] = float(input("Enter Avg. Daily Usage Hours (e.g., 5.5): "))
        user_data['Most_Used_Platform'] = input("Enter Most Used Platform (e.g., Instagram, Facebook, TikTok): ")
        user_data['Affects_Academic_Performance'] = input("Does social media affect academic performance? (Yes/No): ")
        user_data['Sleep_Hours_Per_Night'] = float(input("Enter Sleep Hours Per Night (e.g., 6.0): "))
        user_data['Mental_Health_Score'] = int(input("Enter Mental Health Score (0-10, e.g., 5): "))
        user_data['Relationship_Status'] = input("Enter Relationship Status (e.g., In Relationship, Single, Complicated): ")
        user_data['Conflicts_Over_Social_Media'] = int(input("Enter Conflicts Over Social Media (0-10, e.g., 2): "))
    except ValueError:
        print("\nInvalid input. Please enter numbers for numeric fields.")
        return None
        
    return [user_data]



- **Purpose**:
    
    Collects **user input interactively** from the console.
    
- **Steps**:
    - Asks the user for all features needed by the model (Age, Gender, Usage Hours, etc.).
    - Converts numeric fields to **int** or **float**.
    - Handles errors if the user types the wrong format (like entering letters in a numeric field).
    - Returns the data as a list containing one dictionary (since ML expects data in row-like format).

In [44]:
# Get data from the user
user_input_data = get_user_data()

if user_input_data:
    predicted_score = predict_addiction_score(user_input_data)
    level = categorize_addiction(predicted_score[0])
    
    print("\n--- Prediction Results ---")
    print(f"Predicted Social Media Addiction Score: {predicted_score[0]:.4f}")
    print(f"Addiction Level: {level}")


Please enter the following information to predict the social media addiction score:


Enter Age (e.g., 20):  20
Enter Gender (e.g., Male, Female):  male
Enter Academic Level (e.g., Undergraduate, High School, Postgraduate):  highschool
Enter Avg. Daily Usage Hours (e.g., 5.5):  3
Enter Most Used Platform (e.g., Instagram, Facebook, TikTok):  facebook
Does social media affect academic performance? (Yes/No):  yes
Enter Sleep Hours Per Night (e.g., 6.0):  3
Enter Mental Health Score (0-10, e.g., 5):  4
Enter Relationship Status (e.g., In Relationship, Single, Complicated):  single
Enter Conflicts Over Social Media (0-10, e.g., 2):  2



--- Prediction Results ---
Predicted Social Media Addiction Score: 7.1551
Addiction Level: High


# **Flow**:

1. Calls `get_user_data()` → collects input.
2. If input is valid, passes it to `predict_addiction_score()`.
3. Gets a prediction (array with one value).
4. Prints the **predicted addiction score** with **4 decimal places**.

### Behavioral and Consequence Features

These features directly show the actions and outcomes of social media use.

- **Avg_Daily_Usage_Hours:** This is the most direct measure of a person's digital behavior. Higher usage is a clear indicator of a strong habit, which can lead to addiction.
- **Sleep_Hours_Per_Night:** This is a major consequence. Less sleep is a common side effect of excessive screen time, showing that social media use is negatively impacting physical health.
- **Conflicts_Over_Social_Media:** These conflicts indicate that social media is straining real-world relationships, a key symptom of addiction.
- **Affects_Academic_Performance:** This feature shows that social media use is severe enough to interfere with important responsibilities, a sign that the user has lost control.

---

### Psychological and Social Features

These features provide context about a person's mental state and social life, which can influence or be influenced by addiction.

- **Mental_Health_Score:** A person's mental state is closely linked to their social media habits. A lower score might indicate someone is using social media to cope, while a higher score might suggest they are more resilient to the negative effects.
- **Age** and **Academic_Level:** These demographic features can help the model identify patterns across different life stages. For example, social media habits often differ between high school and university students.
- **Gender:** Like age and academic level, gender can reveal demographic patterns in social media use and addiction risk.
- **Relationship_Status:** A person's relationship status can influence their social media habits, such as their use of messaging apps or dating platforms, which can play a role in addiction.
- **Most_Used_Platform:** The type of platform a student uses most often matters because different platforms have different features (e.g., visual content, short-form video, messaging) that can be more or less addictive.

## Hyperparameter Tuning for Final Model

In [50]:
# Features and Target

features = df.drop('Addicted_Score', axis=1)
target = df['Addicted_Score']

numeric_features = ['Age', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night', 
                    'Mental_Health_Score', 'Conflicts_Over_Social_Media']
categorical_features = ['Gender', 'Academic_Level', 'Most_Used_Platform', 
                        'Relationship_Status', 'Affects_Academic_Performance']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])



**Code Explination**
- **`features`**: All columns except the target (`Addicted_Score`).
- **`target`**: The column we want to predict (`Addicted_Score`).
- **Numeric features**: Columns with continuous or integer values.
- **Categorical features**: Columns with categories that need encoding.
- **ColumnTransformer** preprocesses your data:
- **Numeric** → scaled using `StandardScaler` (centers & normalizes values).
- **Categorical** → one-hot encoded (convert categories into 0/1 columns).

In [51]:
# Gradient Boosting Pipeline
gb_model = GradientBoostingRegressor(random_state=42)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gb_model)
])


# Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)


**Code Explination**
- We **combine preprocessing + regressor** into a single pipeline.
- This ensures new data goes through the same scaling/encoding before prediction.
- **Splits data** into training (80%) and testing (20%) sets.
- Ensures the model is evaluated on unseen data.

In [57]:
model_pipeline.fit(X_train, y_train) 
# Predictions on train set
y_train_pred = model_pipeline.predict(X_train)

# Train metrics
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

print("\nGradient Boosting (Tuned) Train Results:")
print(f"Train MSE: {mse_train:.4f}")
print(f"Train RMSE: {rmse_train:.4f}")
print(f"Train R² Score: {r2_train:.4f}")

# Predictions on test set
y_test_pred = model_pipeline.predict(X_test)

# Test metrics
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

print("\nGradient Boosting (Tuned) Test Results:")
print(f"Test MSE: {mse_test:.4f}")
print(f"Test RMSE: {rmse_test:.4f}")
print(f"Test R² Score: {r2_test:.4f}")



Gradient Boosting (Tuned) Train Results:
Train MSE: 0.0237
Train RMSE: 0.1540
Train R² Score: 0.9906

Gradient Boosting (Tuned) Test Results:
Test MSE: 0.0556
Test RMSE: 0.2359
Test R² Score: 0.9778


**Code explination**
- Defines **possible hyperparameters** for tuning Gradient Boosting.
- `regressor__` prefix is required because the model is inside the pipeline.
- Uses **RandomizedSearchCV** with 5-fold cross-validation to find the **best hyperparameters**.
- `scoring='neg_mean_squared_error'` → MSE is minimized.
- Fits the **pipeline on training data** while tuning hyperparameters.
- `best_model` → the pipeline with the best combination of hyperparameters.
- Makes predictions on **test data**.
- Computes:
    - **MSE** → mean squared error
    - **RMSE** → square root of MSE
    - **R² Score** → how well the model explains variance (1.0 is perfect)
    
- Saves the **entire tuned pipeline** for later use.

## Comparision of output- before and after hyperparameter tuining

### 1. **Mean Squared Error (MSE)**

- **Before Tuning:** 0.0556
- **After Tuning:** 0.0320
- **Interpretation:** The MSE decreased after tuning, meaning the average squared difference between predicted and actual addiction scores is smaller. The model’s predictions are closer to the true values.

---

### 2. **Root Mean Squared Error (RMSE)**

- **Before Tuning:** 0.2359
- **After Tuning:** 0.1790
- **Interpretation:** RMSE also decreased, confirming that the prediction errors are smaller in the tuned model. A lower RMSE indicates more precise predictions.

---

### 3. **R² Score**

- **Before Tuning:** 0.9778
- **After Tuning:** 0.9872
- **Interpretation:** The R² score increased after tuning, showing that the tuned model explains more variance in the addicted score. The model fit improved.

---

### 4. **Overall Comparison**

- Hyperparameter tuning improved **all performance metrics**:
    - Lower MSE and RMSE → predictions are more accurate.
    - Higher R² → model explains more variability in the data.
- The tuned model is **more reliable and precise** for predicting social media addiction scores.