In [1]:
import pandas as pd
import ast
import seaborn as sns
import requests
import matplotlib.pyplot as plt
import numpy as np


In [2]:
bec_results = pd.read_csv('cast_bechdel_results.csv')

In [9]:
bec_results['release_date']

0       1919
1       1920
2       1921
3       1921
4       1922
        ... 
4220    2017
4221    2017
4222    2017
4223    2016
4224    2017
Name: release_date, Length: 4225, dtype: int64

In [5]:
bec_results['passes_test'] = np.where(bec_results['rating'] == 3, "Passes", "Fails")
bec_results['passes_test_binary'] = np.where(bec_results['passes_test'] == "Passes", 1, 0)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
X = bec_results[['year', 'runtime',
        'Western', 'Romance', 'War', 'Crime', 'TV Movie',
       'Foreign', 'History', 'Thriller', 'Action', 'Horror', 'Fantasy',
       'Animation', 'Adventure', 'Mystery', 'Drama', 'Family', 'Music',
       'Comedy', 'avg_cast_gender', 'directing_avg_gender',
       'production_avg_gender', 'writing_avg_gender']]  # predictors
y = bec_results['passes_test_binary']  # binary target

# Step 1: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Train/test split (optional, good for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 3: Fit L1 logistic regression
model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)  # C is inverse of regularization strength
model.fit(X_train, y_train)

# Step 4: Get important features
coef = model.coef_[0]
feature_names = X.columns
important_features = pd.Series(coef, index=feature_names)
selected_features = important_features[important_features != 0]

print("Selected features:")
print(selected_features.sort_values(key=abs, ascending=False))

Selected features:
avg_cast_gender         -0.926474
writing_avg_gender      -0.380830
Horror                   0.226673
year                     0.128687
Family                   0.103722
Drama                   -0.090108
Action                  -0.082590
Crime                   -0.082566
directing_avg_gender    -0.059826
Thriller                -0.053512
runtime                  0.047785
Fantasy                  0.041832
Mystery                  0.035888
Romance                  0.032904
Western                 -0.031887
History                  0.029380
Adventure               -0.029146
Comedy                  -0.025006
TV Movie                 0.019805
Foreign                  0.018233
production_avg_gender   -0.011765
Animation               -0.011534
War                     -0.010251
Music                    0.004866
dtype: float64


In [12]:
coefficients = model.coef_[0]  # for binary classification, it's a 1D array

# Create a Pandas Series for easy inspection
coef_series = pd.Series(coefficients, index=feature_names)


excluded_features = coef_series[coef_series == 0.0]
print("Features excluded (coef = 0):")
print(excluded_features)

Features excluded (coef = 0):
Series([], dtype: float64)
