In [1]:
import json
import pandas as pd
import numpy as np
import json
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from mlxtend.plotting import plot_decision_regions

In [2]:
def load_tmdb_movies(path):
    df = pd.read_csv(path, parse_dates=['release_date'])
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [3]:
def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [4]:
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews'
                ]

In [5]:
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',  # it's possible that spoken_languages would be a better match
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users',
                                         }

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}

In [6]:
def safe_access(container, index_values):
    # return a missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan

In [7]:
def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])

In [8]:
def get_companies(company_data):
    companies = [x['name'] for x in company_data if x['name'] == 'Director']
    return safe_access(directors, [0])

In [9]:
def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])

In [10]:
def convert_to_original_format(movies, credits):
    # Converts TMDb data to make it as compatible as possible with kernels built on the original version of the data.
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['company'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    # tmdb_movies['production_companies'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actor_1_name'] = credits['cast'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['actor_2_name'] = credits['cast'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['actor_3_name'] = credits['cast'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [11]:
movies = load_tmdb_movies("Data/Original/tmdb_5000_movies.csv")
credits = load_tmdb_credits("Data/tmdb_5000_credits.csv")
original_format = convert_to_original_format(movies, credits)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
original_format

JSONDecodeError: Expecting ',' delimiter: line 1 column 869 (char 868)

In [12]:
df = original_format.copy()
features_to_remove =['id','original_title','overview', 'movie_title', 'homepage', 'status' , 'plot_keywords', 'spoken_languages', 'gross', 'title_year', 'production_countries','production_companies','tagline', 'vote_average','num_voted_users', 'genres']
df.drop(features_to_remove, axis = 1, inplace= True)
df.columns

Index(['budget', 'language', 'popularity', 'release_date', 'duration',
       'country', 'company', 'director_name', 'actor_1_name', 'actor_2_name',
       'actor_3_name'],
      dtype='object')

In [13]:
replace_with_zero = ['duration','title_year']

missing_val_cols = list(df.columns[df.isna().any()])
for col in missing_val_cols:
  if col not in replace_with_zero:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
    df[col].fillna('Missing', inplace=True)
  else:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
    df[col].fillna(df[col].mean(), inplace=True)

language : 1.7905475744326462
release_date : 0.020820320632937747
duration : 0.041640641265875494
country : 3.622735790131168
company : 7.30793254216115
director_name : 0.6246096189881324
actor_1_name : 0.8952737872163231
actor_2_name : 1.1034769935457007
actor_3_name : 1.311680199875078


In [14]:
df['release_date'] = pd.to_datetime(df['release_date'].astype(str),errors='coerce')
df['dayofrelease']=df['release_date'].dt.strftime('%A')
df.drop(columns=['release_date'], inplace = True)
df.head(3)

Unnamed: 0,budget,language,popularity,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease
0,237000000,English,150.437577,162.0,United States of America,Ingenious Film Partners,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Thursday
1,300000000,English,139.082615,169.0,United States of America,Walt Disney Pictures,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Saturday
2,245000000,Français,107.376788,148.0,United Kingdom,Columbia Pictures,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,Monday


In [15]:
data_copy = df.copy()

nominal = ['dayofrelease', 'language']
one_hot = pd.get_dummies(data_copy[nominal])
data_copy.drop(['dayofrelease','language'], axis=1, inplace=True)
data_copy = data_copy.join(one_hot)
data_copy.head()

Unnamed: 0,budget,popularity,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말
0,237000000,150.437577,162.0,United States of America,Ingenious Film Partners,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,300000000,139.082615,169.0,United States of America,Walt Disney Pictures,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,245000000,107.376788,148.0,United Kingdom,Columbia Pictures,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,250000000,112.31295,165.0,United States of America,Legendary Pictures,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,260000000,43.926995,132.0,United States of America,Walt Disney Pictures,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# from sklearn.preprocessing import LabelEncoder

# ordinal = ['actor_1_name','actor_2_name','actor_3_name','director_name','country', 'company']
# for feature in ordinal:
#     data_copy[feature] = data_copy[feature].astype('category')
#     data_copy[feature] = data_copy[feature].cat.codes
# # pd.set_option('display.max_rows', None)

In [17]:
def get_integer_mapping(le):
    '''
    Return a dict mapping labels to their integer values
    from an SKlearn LabelEncoder
    le = a fitted SKlearn LabelEncoder
    '''
    res = {}
    for cl in le.classes_:
        res.update({cl:le.transform([cl])[0]})

    return res

In [18]:
# df.loc[df['actor_1_name'] == '']

In [19]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
ordinal = ['actor_1_name','actor_2_name','actor_3_name','director_name','country', 'company']

for feature in ordinal:
    data_copy[feature] = labelencoder.fit_transform(data_copy[feature])
    integerMapping = get_integer_mapping(labelencoder)
    with open(f'{feature}.txt','w', encoding="utf-8") as data:
        data.write(str(integerMapping))
data_copy    

Unnamed: 0,budget,popularity,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말
0,237000000,150.437577,162.0,70,611,885,1760,2715,2725,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,300000000,139.082615,169.0,70,1258,769,991,2017,1674,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0,0.857008,98.0,70,790,439,441,743,301,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4802,0,1.929883,90.0,70,1313,249,529,294,585,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
scaler = StandardScaler()

# numerical = df.select_dtypes(include=['int64','float']).copy()
numerical = data_copy.columns[data_copy.dtypes.apply(lambda c: np.issubdtype(c, np.number))]


data_copy[numerical] = scaler.fit_transform(data_copy[numerical])
data_copy.head()

Unnamed: 0,budget,popularity,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말
0,5.107181,4.053183,2.438596,0.566591,-0.404521,-0.47771,1.171365,1.746301,1.324216,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846
1,6.654402,3.696258,2.748263,0.566591,1.329514,-0.647522,-0.108215,0.850469,0.13841,-0.864029,-0.225798,4.389285,-0.217527,-0.485445,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846
2,5.303653,2.699638,1.81926,0.517966,-1.339882,1.113538,-1.026717,-1.158094,0.403552,-0.864029,4.42874,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,-2.419016,-0.133418,-0.014431,6.593347,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846
3,5.426449,2.854798,2.57131,0.566591,-0.190112,-1.246253,-1.156506,0.602768,-0.62881,-0.864029,4.42874,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846
4,5.672039,0.705198,1.111448,0.566591,1.329514,-1.637111,1.442589,0.365334,1.200107,-0.864029,-0.225798,-0.227828,-0.217527,-0.485445,-0.280786,2.245331,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846


In [21]:
data_copy.drop(columns=['popularity'], inplace= True)
new_data = pd.concat([data_copy, df['popularity']], axis =1)

In [22]:
rf_data = data_copy.copy()
rf_data.dtypes

budget              float64
duration            float64
                     ...   
language_普通话        float64
language_한국어/조선말    float64
Length: 63, dtype: object

In [23]:
rf_data['Success'] = np.where(df['popularity'] >= df['popularity'].mean(), 1,0)
# knn_data.drop(columns =['standardised_popularity'], inplace= True)
rf_data

Unnamed: 0,budget,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말,Success
0,5.107181,2.438596,0.566591,-0.404521,-0.477710,1.171365,1.746301,1.324216,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846,1
1,6.654402,2.748263,0.566591,1.329514,-0.647522,-0.108215,0.850469,0.138410,-0.864029,-0.225798,4.389285,-0.217527,-0.485445,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,-0.713319,-0.392652,0.566591,0.075220,-1.130605,-1.023389,-0.784617,-1.410697,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846,0
4802,-0.713319,-0.746559,0.566591,1.476921,-1.408744,-0.876962,-1.360876,-1.090270,1.157369,-0.225798,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369,-0.050047,-0.014431,-0.032282,-0.014431,-0.014431,-0.050047,-0.113419,-0.014431,0.413391,-0.133418,-0.014431,-0.151668,-0.014431,-0.014431,-0.014431,-0.081897,-0.025,-0.040846,-0.028871,-0.135026,-0.035366,-0.02041,-0.028871,-0.025,-0.059599,-0.080599,-0.028871,-0.014431,-0.02041,-0.014431,-0.032282,-0.014431,-0.07928,-0.032282,-0.02041,-0.014431,-0.035366,-0.014431,-0.083176,-0.025,-0.067835,-0.014431,-0.02041,-0.028871,-0.076576,-0.069367,-0.081897,-0.040846,0


In [24]:
rf_X = rf_data.iloc[:,:-1].values
rf_y = rf_data.iloc[:,63].values
rf_X

array([[ 5.10718073,  2.43859561,  0.56659144, ..., -0.06936653,
        -0.08189743, -0.04084611],
       [ 6.65440223,  2.74826337,  0.56659144, ..., -0.06936653,
        -0.08189743, -0.04084611],
       [ 5.3036533 ,  1.81926009,  0.51796572, ..., -0.06936653,
        -0.08189743, -0.04084611],
       ...,
       [-0.71331921,  0.58058904,  0.56659144, ..., -0.06936653,
        -0.08189743, -0.04084611],
       [-0.71331921, -0.39265249,  0.56659144, ..., -0.06936653,
        -0.08189743, -0.04084611],
       [-0.71331921, -0.7465585 ,  0.56659144, ..., -0.06936653,
        -0.08189743, -0.04084611]])

In [25]:
pca = PCA(n_components = 9)
rf_X = pca.fit_transform(rf_X)
rf_X

array([[-7.92456217e-01, -3.66585669e+00,  5.28298805e-01, ...,
         4.09363543e-01, -9.97857998e-01,  1.45640939e-02],
       [-1.00416709e+00, -4.16281391e+00,  1.58760152e+00, ...,
         1.76352407e+00, -1.98945044e+00, -1.36516074e-01],
       [ 2.92993012e+00, -3.59913230e+00,  2.39664666e+00, ...,
        -5.80378593e-01, -3.52661853e-01, -4.88798949e-01],
       ...,
       [ 2.81670743e-03,  1.62147117e-03, -1.12281794e+00, ...,
         7.54015287e-02, -7.07461746e-01,  1.88059196e+00],
       [-5.12681923e-01, -7.41095691e-01, -1.42875353e+00, ...,
         6.25490563e-01,  5.14499696e-01,  1.31979906e-01],
       [-1.11034233e+00,  1.13873725e+00,  4.54613970e-01, ...,
         8.14725038e-01,  4.49884135e-01,  4.76755420e-01]])

In [26]:
rfX_train, rfX_test, rfy_train, rfy_test = train_test_split(rf_X, rf_y, test_size=0.30)

In [27]:
classifier = RandomForestClassifier(n_estimators=85)
classifier.fit(rfX_train, rfy_train)


RandomForestClassifier(n_estimators=85)

In [28]:
y_pred = classifier.predict(rfX_test)

In [29]:
classifier.score(rfX_test, rfy_test)*100

72.93546148507981

In [30]:
# dump(classifier, 'RandomForest.joblib')