In [1]:
!pip install cem



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Function to convert duration from '1h 32m' to minutes
def convert_duration_improved(duration):
    if not isinstance(duration, str):
        return np.nan  # Return NaN for non-string values
    hours = 0
    minutes = 0
    parts = duration.split(' ')
    for part in parts:
        if 'h' in part:
            try:
                hours = int(part.replace('h', ''))
            except ValueError:
                hours = 0
        elif 'm' in part:
            try:
                minutes = int(part.replace('m', ''))
            except ValueError:
                minutes = 0
    return hours * 60 + minutes

# Read the data
file_path = 'Merged_diversity_movies.csv'
df = pd.read_csv(file_path)

# Preprocessing
df['duration'] = df['duration'].apply(convert_duration_improved)
scaler = MinMaxScaler()
numerical_columns = ['tomatometer', 'audience_score', 'weighted_score', 'duration', 'year']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df['genres'] = df['genres'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))
df_exploded = df.explode('genres')
df_exploded = pd.get_dummies(df_exploded, columns=['genres', 'rating'])

df.head()

# Feature selection and splitting data
features = df_exploded.drop(columns=['name', 'actors', 'directors', 'Movie', 'Production Budget', 'Domestic Gross', 'WorldwideGross', 'Release Date', 'diversity_score', 'index'])
target = df_exploded['diversity_score']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Training the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Displaying feature importance
feature_importance = model.feature_importances_
importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': feature_importance}).sort_values(by='Importance', ascending=False)
importance_df.head()  # Display the top 5 important features


Unnamed: 0,Feature,Importance
3,duration,0.214029
1,audience_score,0.175617
0,tomatometer,0.16222
2,weighted_score,0.15432
4,year,0.146937


In [1]:
# !pip install pymatch

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from pymatch.Matcher import Matcher

def convert_duration_improved(duration):
    if not isinstance(duration, str):
        return np.nan  # Return NaN for non-string values
    hours = 0
    minutes = 0
    parts = duration.split(' ')
    for part in parts:
        if 'h' in part:
            try:
                hours = int(part.replace('h', ''))
            except ValueError:
                hours = 0
        elif 'm' in part:
            try:
                minutes = int(part.replace('m', ''))
            except ValueError:
                minutes = 0
    return hours * 60 + minutes

# Read the data
file_path = 'Merged_diversity_movies_isDiverse.csv'
df = pd.read_csv(file_path)

def cleanCurrency(x):
    if isinstance(x, str):
        return x.replace('$', '').replace(',', '')
    return x

# Preprocessing
df['duration'] = df['duration'].apply(convert_duration_improved)
scaler = MinMaxScaler()
numerical_columns = ['tomatometer', 'audience_score', 'weighted_score', 'duration', 'year']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df['genres'] = df['genres'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))
df_exploded = df.explode('genres')
df_exploded = pd.get_dummies(df_exploded, columns=['genres', 'rating'])
list_of_categorical_vars = ['genres_Action', 'genres_Adventure',
       'genres_Animation', 'genres_Biography', 'genres_Comedy', 'genres_Crime',
       'genres_Documentary', 'genres_Drama', 'genres_Fantasy',
       'genres_History', 'genres_Holiday', 'genres_Horror',
       'genres_Kids & Family', 'genres_LGBTQ+', 'genres_Music',
       'genres_Musical', 'genres_Mystery & Thriller', 'genres_Romance',
       'genres_Sci-Fi', 'genres_War', 'genres_Western', 'rating_G',
       'rating_NC-17', 'rating_PG', 'rating_PG-13', 'rating_R', 'rating_TVG',
       'rating_TVMA', 'Release Date', 'Domestic Gross', 'Production Budget']
# list_of_categorical_vars = [col for col in df_exploded.columns if 'genres_' in col or 'rating_' in col]
# 
print(list_of_categorical_vars)

# Feature selection and splitting data
features = df_exploded.drop(columns=['name', 'actors', 'directors', 'Movie', 'Production Budget', 'Domestic Gross', 'WorldwideGross', 'Release Date', 'diversity_score', 'index'])
target = df_exploded['isDiverse']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Training the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# # CEM
# matcher = Matcher(X_train, X_test, y_train, y_test, list_of_categorical_vars)

#isDiverse is binary variable treatment and control
treatment = df_exploded[df_exploded['isDiverse'] == 1]
control = df_exploded[df_exploded['isDiverse'] == 0]

# Remove '$' and ',' before converting to int
# treatment['Domestic Gross'] = treatment['Domestic Gross'].replace('[\$,]', '', regex=True).astype(int)
# control['Domestic Gross'] = control['Domestic Gross'].replace('[\$,]', '', regex=True).astype(int)

matcher = Matcher(treatment, control, yvar="isDiverse", exclude=list_of_categorical_vars)
np.random.seed(20175)
matcher.fit_scores(balance=True, nmodels=10)







['genres_Action', 'genres_Adventure', 'genres_Animation', 'genres_Biography', 'genres_Comedy', 'genres_Crime', 'genres_Documentary', 'genres_Drama', 'genres_Fantasy', 'genres_History', 'genres_Holiday', 'genres_Horror', 'genres_Kids & Family', 'genres_LGBTQ+', 'genres_Music', 'genres_Musical', 'genres_Mystery & Thriller', 'genres_Romance', 'genres_Sci-Fi', 'genres_War', 'genres_Western', 'rating_G', 'rating_NC-17', 'rating_PG', 'rating_PG-13', 'rating_R', 'rating_TVG', 'rating_TVMA', 'Release Date', 'Domestic Gross', 'Production Budget']
Formula:
isDiverse ~ name+tomatometer+audience_score+weighted_score+duration+year+actors+directors+index+Movie+WorldwideGross+diversity_score
n majority: 2795
n minority: 2657
Fitting Models on Balanced Samples: 1\10

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Error: SVD did not converge in Linear Least Squares
Fitting Models on Balanced Samples: 1\10

KeyboardInterrupt: 

In [39]:
from pymatch.Matcher import Matcher

# Additional preprocessing as per user's code
df['duration'] = df['duration'].apply(convert_duration_improved)
scaler = MinMaxScaler()
numerical_columns = ['tomatometer', 'audience_score', 'weighted_score', 'duration', 'year']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df['genres'] = df['genres'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))
df_exploded = df.explode('genres')
df_exploded = pd.get_dummies(df_exploded, columns=['genres', 'rating'])

# Defining the list of categorical variables
list_of_categorical_vars = [col for col in df_exploded.columns if 'genres_' in col or 'rating_' in col]

# Feature selection and splitting data
features = df_exploded.drop(columns=['name', 'actors', 'directors', 'Movie', 'Production Budget', 'Domestic Gross', 'WorldwideGross', 'Release Date', 'diversity_score', 'index'])
target = df_exploded['isDiverse']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initializing the Matcher
matcher = Matcher(X_train, X_test, y_train, y_test, list_of_categorical_vars, replace=True)
np.random.seed(20170925)
matcher.fit_scores(balance=True, nmodels=100)
matcher.predict_scores()
matcher.evaluate_perf()

# Results of the evaluation
matcher_performace = matcher.perf_df
matcher_performace.head()  # Display the performance of the matcher


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


AttributeError: 'list' object has no attribute 'strip'

In [37]:
file_path = 'Merged_diversity_movies.csv'
df = pd.read_csv(file_path)

# create a isDiverse column with 1 if diversity_score > the median of diversity_score, 0 otherwise
df['isDiverse'] = df['diversity_score'].apply(lambda x: 1 if x > df['diversity_score'].median() else 0)

# save it to a csv file
df.to_csv('Merged_diversity_movies_isDiverse.csv', index=False)


0    1161
1    1161
Name: isDiverse, dtype: int64
0.2141836682293353
