In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
movies_data = pd.read_csv('movies_data.csv')

# Handle missing values and make a copy of the DataFrame to avoid chained assignment
movies_data = movies_data.copy()
movies_data.dropna(subset=['Wins', 'Genre'], inplace=True)

# Convert 'Nominations' and 'Wins' columns to numeric, if not already
movies_data['Nominations'] = pd.to_numeric(movies_data['Nominations'], errors='coerce')
movies_data['Wins'] = pd.to_numeric(movies_data['Wins'], errors='coerce')

# Fill missing numeric values with 0
movies_data['Nominations'] = movies_data['Nominations'].fillna(0)
movies_data['Wins'] = movies_data['Wins'].fillna(0)

# Separate majority and minority classes
majority_class = movies_data[movies_data['Wins'] == 0]
minority_class = movies_data[movies_data['Wins'] > 0]

# Oversample the minority class
minority_class_oversampled = resample(minority_class, 
                                      replace=True,      # Sample with replacement
                                      n_samples=len(majority_class), # Match majority class size
                                      random_state=42)   # Seed for reproducibility

# Combine majority class with oversampled minority class
balanced_data = pd.concat([majority_class, minority_class_oversampled])

# Combine training data and new data for LabelEncoder
new_data = pd.DataFrame({
    'Genre': ['Drama', 'Action', 'Comedy', 'Crime', 'Fantasy', 'Adventure', 'Sci-Fi', 'Biography'],
    'Nominations': [5, 8, 3, 6, 4, 7, 9, 2]
})
combined_genres = pd.concat([balanced_data['Genre'], new_data['Genre']], ignore_index=True)

# Ensure 'Genre' column is a string and convert it to numerical values
le = LabelEncoder()
le.fit(combined_genres)
balanced_data['Genre'] = le.transform(balanced_data['Genre'].astype(str))

# Select relevant features for the model
X = balanced_data[['Genre', 'Nominations']]
y = balanced_data['Wins']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Perform stratified cross-validation with Gradient Boosting
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=stratified_kfold, scoring='accuracy')

print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean()}')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model with the entire training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model with zero_division parameter
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Preprocess new data using the same steps
new_data['Genre'] = le.transform(new_data['Genre'].astype(str))
X_new = new_data[['Genre', 'Nominations']]
X_new = scaler.transform(X_new)

# Predict the number of wins for new data
new_predictions = model.predict(X_new)

# Display the predictions
for genre, wins in zip(new_data['Genre'], new_predictions):
    original_genre = le.inverse_transform([genre])[0]
    print(f"Predicted Wins for {original_genre}: {wins}")


Cross-Validation Accuracy Scores: [0.94736842 1.         1.         0.91891892 0.91891892]
Mean CV Accuracy: 0.957041251778094
Accuracy: 0.9736842105263158
Confusion Matrix:
[[18  1  0  0  0  0]
 [ 0  4  0  0  0  0]
 [ 0  0  3  0  0  0]
 [ 0  0  0  4  0  0]
 [ 0  0  0  0  3  0]
 [ 0  0  0  0  0  5]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97        19
         2.0       0.80      1.00      0.89         4
         5.0       1.00      1.00      1.00         3
         6.0       1.00      1.00      1.00         4
         8.0       1.00      1.00      1.00         3
        11.0       1.00      1.00      1.00         5

    accuracy                           0.97        38
   macro avg       0.97      0.99      0.98        38
weighted avg       0.98      0.97      0.97        38

Predicted Wins for Drama: 2.0
Predicted Wins for Action: 8.0
Predicted Wins for Comedy: 2.0
Predicted Wins for Crime: 2.0
Predicted W