# Level 1

## Task 1 - Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score, roc_curve, auc, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

### Load the dataset

In [2]:
df = pd.read_csv('Sentiment dataset.csv')

### Initial exploration

In [3]:
df.shape

(732, 15)

In [4]:
df.columns.tolist()

['Unnamed: 0.1',
 'Unnamed: 0',
 'Text',
 'Sentiment',
 'Timestamp',
 'User',
 'Platform',
 'Hashtags',
 'Retweets',
 'Likes',
 'Country',
 'Year',
 'Month',
 'Day',
 'Hour']

In [5]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB


In [7]:
df.isnull().sum()

Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64

In [8]:
df['Sentiment'].unique()

array([' Positive  ', ' Negative  ', ' Neutral   ', ' Anger        ',
       ' Fear         ', ' Sadness      ', ' Disgust      ',
       ' Happiness    ', ' Joy          ', ' Love         ',
       ' Amusement    ', ' Enjoyment    ', ' Admiration   ',
       ' Affection    ', ' Awe          ', ' Disappointed ',
       ' Surprise     ', ' Acceptance   ', ' Adoration    ',
       ' Anticipation ', ' Bitter       ', ' Calmness     ',
       ' Confusion    ', ' Excitement   ', ' Kind         ',
       ' Pride        ', ' Shame        ', ' Confusion ', ' Excitement ',
       ' Shame ', ' Elation       ', ' Euphoria      ', ' Contentment   ',
       ' Serenity      ', ' Gratitude     ', ' Hope          ',
       ' Empowerment   ', ' Compassion    ', ' Tenderness    ',
       ' Arousal       ', ' Enthusiasm    ', ' Fulfillment  ',
       ' Reverence     ', ' Compassion', ' Fulfillment   ', ' Reverence ',
       ' Elation   ', ' Despair         ', ' Grief           ',
       ' Loneliness     

In [9]:
df['Sentiment'].value_counts()

Sentiment
Positive               44
Joy                    42
Excitement             32
Happy                  14
Neutral                14
                       ..
Vibrancy                1
Culinary Adventure      1
Mesmerizing             1
Thrilling Journey       1
Winter Magic            1
Name: count, Length: 279, dtype: int64

### Clean the data

In [10]:
df_clean = df.copy()
df_clean

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


### Remove unnecessary columns

In [11]:
df_clean = df_clean.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)
df_clean

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


### Clean sentiment labels (remove extra spaces)

In [12]:
df_clean['Sentiment'] = df_clean['Sentiment'].str.strip()
df_clean

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


### Convert timestamp to datetime

In [13]:
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])

### Create additional features

In [14]:
df_clean['Text_Length'] = df_clean['Text'].str.len()
df_clean['Hashtag_Count'] = df_clean['Hashtags'].str.count('#')

In [15]:
df_clean.shape

(732, 15)

In [16]:
df_clean['Sentiment'].nunique()

191

In [17]:
df_clean['Sentiment'].value_counts().head(15)

Sentiment
Positive       45
Joy            44
Excitement     37
Contentment    19
Neutral        18
Gratitude      18
Curiosity      16
Serenity       15
Happy          14
Despair        11
Nostalgia      11
Hopeful         9
Loneliness      9
Awe             9
Grief           9
Name: count, dtype: int64

### Handle missing values

In [18]:
df_clean.isnull().sum().sum()

np.int64(0)

In [19]:
le_platform = LabelEncoder()
le_country = LabelEncoder()
le_sentiment = LabelEncoder()

In [20]:
df_clean['Platform_Encoded'] = le_platform.fit_transform(df_clean['Platform'])
df_clean['Country_Encoded'] = le_country.fit_transform(df_clean['Country'])
df_clean['Sentiment_Encoded'] = le_sentiment.fit_transform(df_clean['Sentiment'])

### Create binary sentiment (Positive vs Non-Positive for binary classification later)

In [21]:
positive_sentiments = ['Positive', 'Joy', 'Happiness', 'Excitement', 'Gratitude', 'Love', 'Happy']
df_clean['Binary_Sentiment'] = df_clean['Sentiment'].apply(lambda x: 1 if x in positive_sentiments else 0)

In [22]:
df_clean['Binary_Sentiment'].value_counts()

Binary_Sentiment
0    566
1    166
Name: count, dtype: int64

### Normalize numerical features

In [23]:
scaler = StandardScaler()
numerical_features = ['Retweets', 'Likes', 'Year', 'Month', 'Day', 'Hour', 'Text_Length', 'Hashtag_Count']

In [24]:
df_scaled = df_clean.copy()
df_scaled[numerical_features] = scaler.fit_transform(df_clean[numerical_features])
numerical_features

['Retweets',
 'Likes',
 'Year',
 'Month',
 'Day',
 'Hour',
 'Text_Length',
 'Hashtag_Count']

### Prepare features for machine learning

In [25]:
features_numerical = ['Platform_Encoded', 'Country_Encoded', 'Year', 'Month', 'Day', 'Hour', 'Text_Length', 'Hashtag_Count']
target_continuous = 'Likes'  # For regression
target_categorical = 'Sentiment_Encoded'  # For classification
target_binary = 'Binary_Sentiment'  # For binary classification

### Split data

In [26]:
X = df_clean[features_numerical]
y_continuous = df_clean[target_continuous]
y_categorical = df_clean[target_categorical]
y_binary = df_clean[target_binary]

In [27]:
X_train, X_test, y_cont_train, y_cont_test = train_test_split(X, y_continuous, test_size=0.3, random_state=42)
_, _, y_cat_train, y_cat_test = train_test_split(X, y_categorical, test_size=0.3, random_state=42)
_, _, y_bin_train, y_bin_test = train_test_split(X, y_binary, test_size=0.3, random_state=42)

In [28]:
X_train.shape

(512, 8)

In [29]:
X_test.shape

(220, 8)

## Task 2 - Linear Regression

### Train linear regression model to predict Likes

In [30]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_cont_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Make predictions

In [31]:
y_pred_lr = lr_model.predict(X_test)

### Evaluate the model

In [32]:
r2 = r2_score(y_cont_test, y_pred_lr)
r2

0.050277288684627996

In [33]:
mse = mean_squared_error(y_cont_test, y_pred_lr)
mse

200.19626912773197

In [34]:
rmse = np.sqrt(mse)
rmse

np.float64(14.149073083694633)

### Interpret coefficients

In [35]:
feature_names = X_train.columns
coefficients = lr_model.coef_
intercept = lr_model.intercept_

In [36]:
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")

Platform_Encoded: -0.5062
Country_Encoded: 0.0145
Year: -0.2604
Month: -0.0611
Day: 0.0088
Hour: 0.6137
Text_Length: 0.0736
Hashtag_Count: 0.0000


## Task 3 - KNN Classification

### Test different K values

In [37]:
k_values = [3, 5, 7, 9, 11]
knn_results = {}

for k in k_values:
    # Train KNN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_bin_train)
    
    # Make predictions
    y_pred_knn = knn_model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_bin_test, y_pred_knn)
    f1 = f1_score(y_bin_test, y_pred_knn)
    
    # Confusion matrix
    cm = confusion_matrix(y_bin_test, y_pred_knn)
    
    # Precision and recall from classification report
    report = classification_report(y_bin_test, y_pred_knn, output_dict=True)
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    
    knn_results[k] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': cm
    }
    
    print(f"--------------- KNN Results for K={k}: ---------------")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print("\n")

--------------- KNN Results for K=3: ---------------
Accuracy: 0.7773
Precision: 0.7691
Recall: 0.7773
F1-Score: 0.4615
Confusion Matrix:
[[150  22]
 [ 27  21]]


--------------- KNN Results for K=5: ---------------
Accuracy: 0.8182
Precision: 0.8031
Recall: 0.8182
F1-Score: 0.5122
Confusion Matrix:
[[159  13]
 [ 27  21]]


--------------- KNN Results for K=7: ---------------
Accuracy: 0.7909
Precision: 0.7643
Recall: 0.7909
F1-Score: 0.3947
Confusion Matrix:
[[159  13]
 [ 33  15]]


--------------- KNN Results for K=9: ---------------
Accuracy: 0.7955
Precision: 0.7651
Recall: 0.7955
F1-Score: 0.3478
Confusion Matrix:
[[163   9]
 [ 36  12]]


--------------- KNN Results for K=11: ---------------
Accuracy: 0.7818
Precision: 0.7400
Recall: 0.7818
F1-Score: 0.2727
Confusion Matrix:
[[163   9]
 [ 39   9]]




### Find best K

In [38]:
best_k = max(knn_results.keys(), key=lambda k: knn_results[k]['accuracy'])
best_k

5

In [39]:
knn_results[best_k]['accuracy']

0.8181818181818182

# Level - 2

## Task 1 - Logisitic Regrerssion

### Train logistic regression model for binary classification

In [40]:
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train, y_bin_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


### Make predictions

In [41]:
y_pred_log_reg = log_reg_model.predict(X_test)
y_pred_proba_log_reg = log_reg_model.predict_proba(X_test)[:, 1]

### Calculate metrics

In [42]:
accuracy_log_reg = accuracy_score(y_bin_test, y_pred_log_reg)
accuracy_log_reg

0.7863636363636364

In [43]:
precision_log_reg = classification_report(y_bin_test, y_pred_log_reg, output_dict=True)['weighted avg']['precision']
precision_log_reg

0.8322125363221253

In [44]:
recall_log_reg = classification_report(y_bin_test, y_pred_log_reg, output_dict=True)['weighted avg']['recall']
recall_log_reg

0.7863636363636364

In [45]:
f1_log_reg = f1_score(y_bin_test, y_pred_log_reg)
f1_log_reg

0.04081632653061224

### ROC curve

In [46]:
fpr, tpr, thresholds = roc_curve(y_bin_test, y_pred_proba_log_reg)
roc_auc = auc(fpr, tpr)

### Interpret coefficients and odds ratios

In [47]:
coefficients_log_reg = log_reg_model.coef_[0]
odds_ratios = np.exp(coefficients_log_reg)

In [48]:
for feature, coef, odds_ratio in zip(feature_names, coefficients_log_reg, odds_ratios):
    print(f"{feature}: Coef={coef:.4f}, Odds Ratio={odds_ratio:.4f}")

Platform_Encoded: Coef=-0.0505, Odds Ratio=0.9508
Country_Encoded: Coef=0.0009, Odds Ratio=1.0009
Year: Coef=0.3579, Odds Ratio=1.4303
Month: Coef=0.0037, Odds Ratio=1.0037
Day: Coef=0.0208, Odds Ratio=1.0210
Hour: Coef=0.0487, Odds Ratio=1.0499
Text_Length: Coef=-0.0036, Odds Ratio=0.9964
Hashtag_Count: Coef=-0.0002, Odds Ratio=0.9998


## Task 2 - Decision Tree

### Train decision tree model

In [49]:
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_model.fit(X_train, y_bin_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Make predictions

In [50]:
y_pred_dt = dt_model.predict(X_test)

### Calculate metrics

In [51]:
accuracy_dt = accuracy_score(y_bin_test, y_pred_dt)
accuracy_dt

0.8272727272727273

In [52]:
f1_dt = f1_score(y_bin_test, y_pred_dt)
f1_dt

0.4722222222222222

In [53]:
precision_dt = classification_report(y_bin_test, y_pred_dt, output_dict=True)['weighted avg']['precision']
precision_dt

0.812708719851577

In [54]:
recall_dt = classification_report(y_bin_test, y_pred_dt, output_dict=True)['weighted avg']['recall']
recall_dt

0.8272727272727273

### Feature importance

In [55]:
feature_importance = dt_model.feature_importances_
print(f"Feature Importance:")
for feature, importance in zip(feature_names, feature_importance):
    print(f"{feature}: {importance:.4f}")

Feature Importance:
Platform_Encoded: 0.0194
Country_Encoded: 0.0542
Year: 0.4097
Month: 0.2423
Day: 0.0399
Hour: 0.0526
Text_Length: 0.1818
Hashtag_Count: 0.0000


### Try pruning (different max_depth values)

In [56]:
max_depths = [3, 5, 7, 10, None]
pruning_results = {}

for depth in max_depths:
    dt_pruned = DecisionTreeClassifier(random_state=42, max_depth=depth)
    dt_pruned.fit(X_train, y_bin_train)
    y_pred_pruned = dt_pruned.predict(X_test)
    accuracy_pruned = accuracy_score(y_bin_test, y_pred_pruned)
    pruning_results[depth] = accuracy_pruned
    print(f"Max Depth {depth}: Accuracy = {accuracy_pruned:.4f}")

Max Depth 3: Accuracy = 0.8091
Max Depth 5: Accuracy = 0.8273
Max Depth 7: Accuracy = 0.8227
Max Depth 10: Accuracy = 0.8182
Max Depth None: Accuracy = 0.8182


In [57]:
best_depth = max(pruning_results.keys(), key=lambda d: pruning_results[d])
best_depth

5

In [58]:
pruning_results[best_depth]

0.8272727272727273

## Task 3 - K-Means Clustering

### Prepare data for clustering (using unlabeled approach)

In [59]:
X_cluster = X_train.copy()

### Use elbow method to find optimal k

In [60]:
inertias = []
k_range = range(2, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_cluster)
    inertias.append(kmeans.inertia_)

for k, inertia in zip(k_range, inertias):
    print(f"K={k}: Inertia={inertia:.2f}")

K=2: Inertia=606121.36
K=3: Inertia=429816.79
K=4: Inertia=279266.21
K=5: Inertia=232314.19
K=6: Inertia=188052.21
K=7: Inertia=162875.70
K=8: Inertia=141717.59
K=9: Inertia=131882.76
K=10: Inertia=123088.65


### Choose optimal k (let's use k=4 based on typical elbow method)

In [61]:
optimal_k = 4
kmeans_model = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_model.fit_predict(X_cluster)
kmeans_model.cluster_centers_

array([[1.01481481e+00, 9.15407407e+01, 2.02094815e+03, 7.03703704e+00,
        1.54444444e+01, 1.68666667e+01, 1.15340741e+02, 2.00000000e+00],
       [1.35211268e+00, 2.75211268e+01, 2.01971831e+03, 5.74647887e+00,
        1.67112676e+01, 1.48873239e+01, 6.50352113e+01, 2.00000000e+00],
       [1.12500000e+00, 2.03854167e+01, 2.02114583e+03, 6.70833333e+00,
        1.44270833e+01, 1.52083333e+01, 1.16979167e+02, 2.00000000e+00],
       [1.15107914e+00, 9.65323741e+01, 2.02035252e+03, 5.63309353e+00,
        1.54604317e+01, 1.48848921e+01, 6.59784173e+01, 2.00000000e+00]])

### Analyze clusters

In [62]:
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
print(f"Cluster distribution:")
for label, count in zip(unique_labels, counts):
    print(f"Cluster {label}: {count} points")

Cluster distribution:
Cluster 0: 135 points
Cluster 1: 142 points
Cluster 2: 96 points
Cluster 3: 139 points


# Level - 3

## Task 1 - Random Forest Classifier

### Train Random Forest with hyperparameter tuning

In [63]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

In [64]:
rf_model = RandomForestClassifier(random_state=42)

In [65]:
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [66]:
grid_search.fit(X_train, y_bin_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [3, 5, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Best model

In [67]:
best_rf_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation score: 0.8377879307062631


### Make predictions

In [68]:
y_pred_rf = best_rf_model.predict(X_test)

### Calculate metrics

In [69]:
accuracy_rf = accuracy_score(y_bin_test, y_pred_rf)
accuracy_rf

0.8318181818181818

In [70]:
f1_rf = f1_score(y_bin_test, y_pred_rf)
f1_rf

0.4931506849315068

In [71]:
precision_rf = classification_report(y_bin_test, y_pred_rf, output_dict=True)['weighted avg']['precision']
precision_rf

0.8186293706293707

In [72]:
recall_rf = classification_report(y_bin_test, y_pred_rf, output_dict=True)['weighted avg']['recall']
recall_rf

0.8318181818181818

### Feature importance

In [73]:
feature_importance_rf = best_rf_model.feature_importances_
print(f"Feature Importance (Random Forest):")
for feature, importance in sorted(zip(feature_names, feature_importance_rf), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

Feature Importance (Random Forest):
Year: 0.2392
Text_Length: 0.1798
Month: 0.1590
Country_Encoded: 0.1347
Day: 0.1327
Hour: 0.1045
Platform_Encoded: 0.0500
Hashtag_Count: 0.0000


### Cross-validation scores

In [74]:
cv_scores = cross_val_score(best_rf_model, X_train, y_bin_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean()} (+/- {cv_scores.std() * 2})")

Cross-validation scores: [0.83495146 0.89320388 0.83333333 0.81372549 0.81372549]
Mean CV score: 0.8377879307062631 (+/- 0.058356277707638134)


## Task 2 - SVM for Classification

### Test different kernels

In [75]:
kernels = ['linear', 'rbf', 'poly']
svm_results = {}

for kernel in kernels:
    print(f"--------------- Testing SVM with {kernel} kernel ---------------")
    
    # Train SVM model
    svm_model = SVC(kernel=kernel, random_state=42, probability=True)
    svm_model.fit(X_train, y_bin_train)
    
    # Make predictions
    y_pred_svm = svm_model.predict(X_test)
    y_pred_proba_svm = svm_model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy_svm = accuracy_score(y_bin_test, y_pred_svm)
    precision_svm = classification_report(y_bin_test, y_pred_svm, output_dict=True)['weighted avg']['precision']
    recall_svm = classification_report(y_bin_test, y_pred_svm, output_dict=True)['weighted avg']['recall']
    f1_svm = f1_score(y_bin_test, y_pred_svm)
    
    # AUC
    fpr_svm, tpr_svm, _ = roc_curve(y_bin_test, y_pred_proba_svm)
    auc_svm = auc(fpr_svm, tpr_svm)
    
    svm_results[kernel] = {
        'accuracy': accuracy_svm,
        'precision': precision_svm,
        'recall': recall_svm,
        'f1_score': f1_svm,
        'auc': auc_svm,
        'fpr': fpr_svm,
        'tpr': tpr_svm
    }
    
    print(f"Accuracy: {accuracy_svm:.4f}")
    print(f"Precision: {precision_svm:.4f}")
    print(f"Recall: {recall_svm:.4f}")
    print(f"F1-Score: {f1_svm:.4f}")
    print(f"AUC: {auc_svm:.4f}")
    print("\n")

--------------- Testing SVM with linear kernel ---------------
Accuracy: 0.7818
Precision: 0.6112
Recall: 0.7818
F1-Score: 0.0000
AUC: 0.6331


--------------- Testing SVM with rbf kernel ---------------
Accuracy: 0.7818
Precision: 0.6112
Recall: 0.7818
F1-Score: 0.0000
AUC: 0.6600


--------------- Testing SVM with poly kernel ---------------
Accuracy: 0.7818
Precision: 0.6112
Recall: 0.7818
F1-Score: 0.0000
AUC: 0.6294




### Find best kernel

In [76]:
best_kernel = max(svm_results.keys(), key=lambda k: svm_results[k]['auc'])
best_kernel

'rbf'

In [77]:
svm_results[best_kernel]['auc']

0.6600048449612403

## Task 3 - Neural Networks with Tensorflow/Keras

### Prepare data for neural network

In [78]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

In [79]:
scaler_nn = StandardScaler()
X_train_scaled = scaler_nn.fit_transform(X_train)
X_test_scaled = scaler_nn.transform(X_test)

### Build a simple feedforward neural network

In [80]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

### Compile the model

In [81]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

### Train the model

In [82]:
history = model.fit(X_train_scaled, y_bin_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

### Evaluate the model

In [83]:
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_bin_test, verbose=0)

### Make predictions

In [84]:
y_pred_nn = model.predict(X_test_scaled)
y_pred_nn_binary = (y_pred_nn > 0.5).astype(int).flatten()

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


### Calculate metrics

In [85]:
accuracy_nn = accuracy_score(y_bin_test, y_pred_nn_binary)
accuracy_nn

0.7818181818181819

In [86]:
f1_nn = f1_score(y_bin_test, y_pred_nn_binary)
f1_nn

0.35135135135135137

In [87]:
precision_nn = classification_report(y_bin_test, y_pred_nn_binary, output_dict=True)['weighted avg']['precision']
precision_nn

0.7498594189315838

In [88]:
recall_nn = classification_report(y_bin_test, y_pred_nn_binary, output_dict=True)['weighted avg']['recall']
recall_nn

0.7818181818181819