# Logistic Regression

## Loading the dataset

In [1]:
import os

import pandas as pd

import holcrawl.shared

In [2]:
# Ask holcrawl for the directory holding the datasets.
# NOTE(review): the helper's leading underscore marks it as non-public in
# holcrawl.shared — confirm there is no public accessor to use instead.
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [3]:
# Full path of the movies CSV inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')

In [4]:
# Load the movies CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(dataset_path)

## Feature Generation

In [5]:
# Total screens = average screens multiplied by the number of weekends.
df['total_screens'] = df['avg_screens'].mul(df['num_weekends'])

In [6]:
# Gross income expressed as a multiple of the budget.
df['norm_gross'] = df['gross_income'].div(df['budget'])

In [23]:
# Target label: True when the movie grossed less than its budget.
# NOTE(review): a NaN norm_gross compares as False, so rows with a missing
# ratio are labelled "not failed" — confirm that is intended.
df['failed'] = df['norm_gross'].lt(1)

In [7]:
# Lower-cased first character of each title.
# The vectorised .str accessor is used instead of a per-row lambda so that a
# missing or empty title yields NaN rather than raising TypeError/IndexError.
df['starting_letter'] = df['name'].str[0].str.lower()

In [8]:
# Number of characters in each title.
# .str.len() replaces the redundant lambda around len() and returns NaN for a
# missing title instead of raising TypeError.
# The column name keeps its original spelling ("name_lenth") because the
# feature whitelist refers to it by that exact name.
df['name_lenth'] = df['name'].str.len()

In [9]:
# df['opening_weekend_date']

In [39]:
# Columns retained for modelling. 'failed' is the target label and
# 'starting_letter' is a raw categorical column.
# 'avg_mc_user_by_opening' is deliberately left out of the list.
FEAT_TO_KEEP = [
    'duration',
    'starting_letter',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'failed',
    'name_lenth',
]

In [40]:
# Also keep every column whose name contains 'genres'.
FEAT_TO_KEEP.extend([column for column in df.columns if 'genres' in column])

In [41]:
# Build the modelling frame by discarding every column that is not whitelisted.
dataset = df.drop(
    columns=[name for name in df.columns if name not in FEAT_TO_KEEP],
)

In [42]:
# One-hot encode the starting letter into 'fl_<char>' indicator columns.
# drop_first=True omits the first category.
letter_dummies = pd.get_dummies(
    dataset['starting_letter'],
    prefix='fl',
    drop_first=True,
)

In [43]:
# Attach each indicator column to the modelling frame under its own name.
dataset = dataset.assign(**dict(letter_dummies.items()))

In [44]:
# Remove the raw letter column.
dataset = dataset.drop(columns=['starting_letter'])

In [51]:
# Discard every row that has a missing value in any remaining column.
dataset = dataset.dropna(axis='index', how='any')

In [52]:
# Split the frame into the target vector (Y) and the feature matrix (X).
Y = dataset['failed']
X = dataset.drop(columns=['failed'])

In [53]:
# Raise the column display limit so the wide frame is shown without truncation.
pd.set_option('display.max_columns', 999)
dataset

Unnamed: 0,budget,duration,year,genres.action,genres.adventure,genres.animation,genres.biography,genres.comedy,genres.crime,genres.documentary,genres.drama,genres.family,genres.fantasy,genres.history,genres.horror,genres.music,genres.musical,genres.mystery,genres.romance,genres.sci-fi,genres.sport,genres.thriller,genres.war,genres.western,avg_mc_critic_by_opening,opening_month,opening_day,opening_day_of_year,name_lenth,failed,fl_2,fl_3,fl_4,fl_5,fl_a,fl_b,fl_c,fl_d,fl_e,fl_f,fl_g,fl_h,fl_i,fl_j,fl_k,fl_l,fl_m,fl_n,fl_o,fl_p,fl_r,fl_s,fl_t,fl_u,fl_v,fl_w
0,20000000.0,134,2013,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,94.147059,10,18,291,16,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4000000.0,93,2014,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,49.700000,4,18,108,7,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1000000.0,91,2013,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,82.600000,6,14,165,20,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,13000000.0,93,2013,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38.500000,3,1,60,11,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50000000.0,112,2014,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.844444,6,13,164,14,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,61000000.0,109,2013,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58.540541,8,2,214,6,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,110000000.0,102,2014,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,49.833333,3,7,66,22,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,28000000.0,117,2014,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,45.523810,2,21,52,14,False,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,40000000.0,128,2013,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,67.257143,4,12,102,2,False,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,175000000.0,128,2013,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,31.777778,12,27,361,8,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
from sklearn.model_selection import train_test_split

In [55]:
# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

In [56]:
from sklearn.linear_model import LogisticRegression

In [58]:
# Fit a logistic regression with scikit-learn's default hyper-parameters on
# the training split. fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Fit the classifier on the training split.
# The notebook never defines `logreg`, so the original call raised NameError;
# the only estimator it creates is `model`.
model.fit(X_train, y_train)