In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


In [2]:
# Load dataset
data = pd.read_csv('twitchdata-update.csv')

data = data.head(50)


In [3]:

# Display the first few rows
print(data.head())

# Handle missing values using mean for numerical columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    data[col].fillna(data[col].mean(), inplace=True)

# Verify missing values are handled
print(data.isnull().sum())  # Should return 0 for all columns


# Create binary target: High Growth = 1, Low Growth = 0
median_followers = data['Followers gained'].median()
data['Growth_Category'] = np.where(data['Followers gained'] > median_followers, 1, 0)

# Encode the 'Language' column
le = LabelEncoder()
data['Language_encoded'] = le.fit_transform(data['Language'])

# Drop irrelevant columns
X = data[['Watch time(Minutes)', 'Stream time(minutes)', 'Peak viewers', 
          'Average viewers', 'Language_encoded']]
y = data['Growth_Category']


    Channel  Watch time(Minutes)  Stream time(minutes)  Peak viewers  \
0     xQcOW           6196161750                215250        222720   
1  summit1g           6091677300                211845        310998   
2    Gaules           5644590915                515280        387315   
3  ESL_CSGO           3970318140                517740        300575   
4      Tfue           3671000070                123660        285644   

   Average viewers  Followers  Followers gained  Views gained  Partnered  \
0            27716    3246298           1734810      93036735       True   
1            25610    5310163           1370184      89705964       True   
2            10976    1767635           1023779     102611607       True   
3             7714    3944850            703986     106546942       True   
4            29602    8938903           2068424      78998587       True   

   Mature    Language  
0   False     English  
1   False     English  
2    True  Portuguese  
3   False     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [4]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC


In [6]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# AUC-ROC Score
roc_auc = roc_auc_score(y_test, y_prob)
print(f'AUC-ROC Score: {roc_auc:.2f}')


Accuracy: 80.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.50      1.00      0.67         2

    accuracy                           0.80        10
   macro avg       0.75      0.88      0.76        10
weighted avg       0.90      0.80      0.82        10

AUC-ROC Score: 0.88


In [7]:
import pickle
from sklearn.ensemble import RandomForestClassifier

# Assuming `model` is your trained RandomForest model
with open('model2.pkl', 'wb') as f:
    pickle.dump(clf, f)

