In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from joblib import dump, load
from sklearn.model_selection import train_test_split


In [2]:
movies = pd.read_csv('movie.csv')

In [3]:
movies.head()

Unnamed: 0.1,Unnamed: 0,genres,id,imdb_id,original_language,overview,release_date,runtime,status,title,vote_average,vote_count
0,23651,"[{'id': 18, 'name': 'Drama'}]",50032,tt0853153,en,Story about the remarkable friendship between ...,2007-09-23,92.0,Released,Stuart: A Life Backwards,7.7,34.0
1,23350,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",105945,tt1694118,en,"Memorial Day, 1993. When 13-year-old Kyle Voge...",2011-06-06,108.0,Released,Memorial Day,5.8,20.0
2,12189,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",6639,tt0484740,en,"In Colombia just after the Great War, an old m...",2007-10-04,139.0,Released,Love in the Time of Cholera,6.5,84.0
3,9089,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",27994,tt0117609,en,Harry Griswald is a NYPD cop who is possessed ...,1991-01-01,105.0,Released,Sgt. Kabukiman N.Y.P.D.,6.1,16.0
4,10744,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",2117,tt0426578,de,True story of Germany's most famous anti-Nazi ...,2005-02-13,117.0,Released,Sophie Scholl: The Final Days,7.1,109.0


In [4]:
movies['genres'].head()

0                        [{'id': 18, 'name': 'Drama'}]
1    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
2    [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...
3    [{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...
4    [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...
Name: genres, dtype: object

In [5]:
movies.drop(columns = 'Unnamed: 0', inplace=True)

In [6]:
movies["title"] = movies["title"].str.lower()

In [7]:
movies.head()

Unnamed: 0,genres,id,imdb_id,original_language,overview,release_date,runtime,status,title,vote_average,vote_count
0,"[{'id': 18, 'name': 'Drama'}]",50032,tt0853153,en,Story about the remarkable friendship between ...,2007-09-23,92.0,Released,stuart: a life backwards,7.7,34.0
1,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",105945,tt1694118,en,"Memorial Day, 1993. When 13-year-old Kyle Voge...",2011-06-06,108.0,Released,memorial day,5.8,20.0
2,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",6639,tt0484740,en,"In Colombia just after the Great War, an old m...",2007-10-04,139.0,Released,love in the time of cholera,6.5,84.0
3,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",27994,tt0117609,en,Harry Griswald is a NYPD cop who is possessed ...,1991-01-01,105.0,Released,sgt. kabukiman n.y.p.d.,6.1,16.0
4,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",2117,tt0426578,de,True story of Germany's most famous anti-Nazi ...,2005-02-13,117.0,Released,sophie scholl: the final days,7.1,109.0


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36372 entries, 0 to 36371
Data columns (total 11 columns):
genres               36372 non-null object
id                   36372 non-null object
imdb_id              36358 non-null object
original_language    36363 non-null object
overview             35618 non-null object
release_date         36301 non-null object
runtime              36168 non-null float64
status               36308 non-null object
title                36367 non-null object
vote_average         36367 non-null float64
vote_count           36367 non-null float64
dtypes: float64(3), object(8)
memory usage: 3.1+ MB


In [9]:
movies.nunique()

genres                3558
id                   36351
imdb_id              36336
original_language       87
overview             35462
release_date         15417
runtime                331
status                   6
title                34185
vote_average            91
vote_count            1628
dtype: int64

In [10]:
movies.shape

(36372, 11)

In [11]:
movies.drop_duplicates(keep='first',inplace = True)

In [12]:
movies.shape

(36353, 11)

In [13]:
movies.nunique()

genres                3558
id                   36351
imdb_id              36336
original_language       87
overview             35462
release_date         15417
runtime                331
status                   6
title                34185
vote_average            91
vote_count            1628
dtype: int64

In [14]:
 movies.drop_duplicates(subset='title', keep="first",inplace = True)

In [15]:
movies.shape

(34186, 11)

In [16]:
movies.nunique()

genres                3400
id                   34186
imdb_id              34173
original_language       85
overview             33327
release_date         14883
runtime                323
status                   6
title                34185
vote_average            90
vote_count            1569
dtype: int64

In [17]:
movies = movies.astype({'id': 'int64'})

In [18]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34186 entries, 0 to 36371
Data columns (total 11 columns):
genres               34186 non-null object
id                   34186 non-null int64
imdb_id              34173 non-null object
original_language    34177 non-null object
overview             33453 non-null object
release_date         34118 non-null object
runtime              33989 non-null float64
status               34129 non-null object
title                34185 non-null object
vote_average         34185 non-null float64
vote_count           34185 non-null float64
dtypes: float64(3), int64(1), object(7)
memory usage: 3.1+ MB


In [19]:
movies = movies.sort_values(by='id').reset_index(drop=True)

In [20]:
movies

Unnamed: 0,genres,id,imdb_id,original_language,overview,release_date,runtime,status,title,vote_average,vote_count
0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",3,tt0092149,fi,"An episode in the life of Nikander, a garbage ...",1986-10-16,76.0,Released,shadows in paradise,7.1,35.0
1,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",5,tt0113101,en,It's Ted the Bellhop's first night on the job....,1995-12-09,98.0,Released,four rooms,6.5,539.0
2,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",6,tt0107286,en,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,110.0,Released,judgment night,6.4,79.0
3,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",11,tt0076759,en,Princess Leia is captured and held hostage by ...,1977-05-25,121.0,Released,star wars,8.1,6778.0
4,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",13,tt0109830,en,A man with a low IQ has accomplished great thi...,1994-07-06,142.0,Released,forrest gump,8.2,8147.0
5,"[{'id': 18, 'name': 'Drama'}]",14,tt0169547,en,"Lester Burnham, a depressed suburban father in...",1999-09-15,122.0,Released,american beauty,7.9,3438.0
6,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",16,tt0168629,en,"Selma, a Czech immigrant on the verge of blind...",2000-05-17,140.0,Released,dancer in the dark,7.7,392.0
7,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",17,tt0411267,en,Adèle and her daughter Sarah are traveling on ...,2006-01-26,87.0,Released,the dark,5.6,76.0
8,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",18,tt0119116,en,"In 2257, a taxi driver is unintentionally give...",1997-05-07,126.0,Released,the fifth element,7.3,3962.0
9,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",20,tt0314412,en,A Pedro Almodovar production in which a fatall...,2003-03-07,106.0,Released,my life without me,7.2,78.0


In [21]:
train = pd.DataFrame()


train['id'] = movies['id']

In [22]:
train['genres'] = movies['genres']

In [23]:
train['title'] = movies['title']

In [24]:
train.head()

Unnamed: 0,id,genres,title
0,3,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",shadows in paradise
1,5,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",four rooms
2,6,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",judgment night
3,11,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",star wars
4,13,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",forrest gump


In [25]:
train.head()

Unnamed: 0,id,genres,title
0,3,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",shadows in paradise
1,5,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",four rooms
2,6,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",judgment night
3,11,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",star wars
4,13,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",forrest gump


In [26]:
genre = pd.DataFrame(train['genres'])

In [27]:
import unicodedata
import ast
from ast import literal_eval
import re
def clean_data(value):
    value =  unicodedata.normalize('NFD', value).encode('ascii', 'ignore').decode('ascii')
    x= str.lower(re.sub(r"[^a-zA-Z0-9]+", '_', value.strip().replace(" ","_")))
    if(x!='_'):
        return x 
    else:
        return " " 
    

In [28]:
train['genres']=train['genres'].apply(lambda s: list(ast.literal_eval(s)))
train['genres']=train['genres'].apply((lambda genres : [ clean_data(genre['name'])  for genre in genres]))

In [29]:
train.head()

Unnamed: 0,id,genres,title
0,3,"[drama, comedy]",shadows in paradise
1,5,"[crime, comedy]",four rooms
2,6,"[action, thriller, crime]",judgment night
3,11,"[adventure, action, science_fiction]",star wars
4,13,"[comedy, drama, romance]",forrest gump


In [30]:
train["Comedy"] = np.nan
train["Crime"] = np.nan
train["Drama"] = np.nan
train["Horror"] = np.nan

In [31]:
train = train.astype({'genres': 'str'})

#static variable
class Foo(object):
    counter = 0

    def __call__(self):
        Foo.counter += 1
        return Foo.counter

# Create an object instance of class "Foo," called "foo"
foo = Foo()

# Make calls to the "__call__" method, via the object's name itself

#seprating
z =0
def m(x):
    li =[]
    global z
    s = ''
    for i in x:
        li2 = []
        if i=='[':
            continue
        elif i == ',' or i==']':
            li2.append(''.join(li))
            li=[]
            s = li2[0]
            if "drama" in s:
                train.at[z,'Drama'] = 1.0
            if "comedy" in s:
                train.at[z,'Comedy'] = 1.0
            if "crime" in s:
                train.at[z,'Crime'] = 1.0
            if "horror" in s:
                train.at[z,'Horror'] = 1.0
            if "adventure" in s:
                train.at[z,'Adventure'] = 1.0
            if "action" in s:
                train.at[z,'Action'] = 1.0
            if "romance" in s:
                train.at[z,'Romance'] = 1.0
            if "fantasy" in s:
                train.at[z,'Fantasy'] = 1.0
            if "music" in s:
                train.at[z,'Music'] = 1.0
            if "science_fiction" in s:
                train.at[z,'Science_Fiction'] = 1.0
            if "war" in s:
                train.at[z,'War'] = 1.0
            if "documentary" in s:
                train.at[z,'Documentary'] = 1.0
            if "thriller" in s:
                train.at[z,'Thriller'] = 1.0
            if "animation" in s:
                train.at[z,'Animation'] = 1.0
            if "family" in s:
                train.at[z,'Family'] = 1.0
            if "mystery" in s:
                train.at[z,'Mystery'] = 1.0
            if "foreign" in s:
                train.at[z,'Foreign'] = 1.0
            if "history" in s:
                train.at[z,'History'] = 1.0
            if "tv movie" in s:
                train.at[z,'TV Movie'] = 1.0
            if "western" in s:
                train.at[z,'Western'] = 1.0
                
                
                
            if i == ']':
                z=foo()
            
        else:
            li.append(i)    
            
train['genres'].map(m)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
34156    None
34157    None
34158    None
34159    None
34160    None
34161    None
34162    None
34163    None
34164    None
34165    None
34166    None
34167    None
34168    None
34169    None
34170    None
34171    None
34172    None
34173    None
34174    None
34175    None
34176    None
34177    None
34178    None
34179    None
34180    None
34181    None
34182    None
34183    None
34184    None
34185    None
Name: genres, Length: 34186, dtype: object

In [32]:
train.head(50)

Unnamed: 0,id,genres,title,Comedy,Crime,Drama,Horror,Action,Thriller,Adventure,...,Music,Mystery,Fantasy,Documentary,War,Western,History,Animation,Family,Foreign
0,3,"['drama', 'comedy']",shadows in paradise,1.0,,1.0,,,,,...,,,,,,,,,,
1,5,"['crime', 'comedy']",four rooms,1.0,1.0,,,,,,...,,,,,,,,,,
2,6,"['action', 'thriller', 'crime']",judgment night,,1.0,,,1.0,1.0,,...,,,,,,,,,,
3,11,"['adventure', 'action', 'science_fiction']",star wars,,,,,1.0,,1.0,...,,,,,,,,,,
4,13,"['comedy', 'drama', 'romance']",forrest gump,1.0,,1.0,,,,,...,,,,,,,,,,
5,14,['drama'],american beauty,,,1.0,,,,,...,,,,,,,,,,
6,16,"['drama', 'crime', 'music']",dancer in the dark,,1.0,1.0,,,,,...,1.0,,,,,,,,,
7,17,"['horror', 'thriller', 'mystery']",the dark,,,,1.0,,1.0,,...,,1.0,,,,,,,,
8,18,"['adventure', 'fantasy', 'action', 'thriller',...",the fifth element,,,,,1.0,1.0,1.0,...,,,1.0,,,,,,,
9,20,"['drama', 'romance']",my life without me,,,1.0,,,,,...,,,,,,,,,,


In [33]:
train.set_index(train['title'],inplace=True)

In [34]:
train = train.drop(columns=['title','genres','id'])

### train.info()

In [35]:
train.fillna(0,inplace = True)

In [36]:
train.head()

Unnamed: 0_level_0,Comedy,Crime,Drama,Horror,Action,Thriller,Adventure,Science_Fiction,Romance,Music,Mystery,Fantasy,Documentary,War,Western,History,Animation,Family,Foreign
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
shadows in paradise,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
four rooms,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
judgment night,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
star wars,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forrest gump,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
train = train.T

In [38]:
train.head()

title,shadows in paradise,four rooms,judgment night,star wars,forrest gump,american beauty,dancer in the dark,the dark,the fifth element,my life without me,...,the everyday,rakka,edith walks,chris d'elia: man on fire,firebase,the truth is in the stars,abduction,tragedy in a temporary town,silja - nuorena nukkunut,manuel on the island of wonders
Comedy,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Crime,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Drama,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
Horror,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Action,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [39]:
correlation = train.corr()

In [40]:
correlation

title,shadows in paradise,four rooms,judgment night,star wars,forrest gump,american beauty,dancer in the dark,the dark,the fifth element,my life without me,...,the everyday,rakka,edith walks,chris d'elia: man on fire,firebase,the truth is in the stars,abduction,tragedy in a temporary town,silja - nuorena nukkunut,manuel on the island of wonders
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
shadows in paradise,1.000000,0.441176,-0.148522,-0.148522,0.792118,0.687184,0.321798,-0.148522,-0.204980,0.441176,...,-0.080845,-0.117647,-0.080845,0.687184,-0.148522,-0.080845,0.441176,0.687184,0.441176,0.441176
four rooms,0.441176,1.000000,0.321798,-0.148522,0.321798,-0.080845,0.321798,-0.148522,-0.204980,-0.117647,...,-0.080845,-0.117647,-0.080845,0.687184,-0.148522,-0.080845,-0.117647,-0.080845,-0.117647,-0.117647
judgment night,-0.148522,0.321798,1.000000,0.208333,-0.187500,-0.102062,0.208333,0.208333,0.396788,-0.148522,...,-0.102062,0.321798,-0.102062,-0.102062,0.208333,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
star wars,-0.148522,-0.148522,0.208333,1.000000,-0.187500,-0.102062,-0.187500,-0.187500,0.724569,-0.148522,...,-0.102062,0.792118,-0.102062,-0.102062,0.604167,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
forrest gump,0.792118,0.321798,-0.187500,-0.187500,1.000000,0.544331,0.208333,-0.187500,-0.258775,0.792118,...,-0.102062,-0.148522,-0.102062,0.544331,-0.187500,-0.102062,0.321798,0.544331,0.792118,0.321798
american beauty,0.687184,-0.080845,-0.102062,-0.102062,0.544331,1.000000,0.544331,-0.102062,-0.140859,0.687184,...,-0.055556,-0.080845,-0.055556,-0.055556,-0.102062,-0.055556,0.687184,1.000000,0.687184,0.687184
dancer in the dark,0.321798,0.321798,0.208333,-0.187500,0.208333,0.544331,1.000000,-0.187500,-0.258775,0.321798,...,-0.102062,-0.148522,-0.102062,-0.102062,-0.187500,-0.102062,0.321798,0.544331,0.321798,0.321798
the dark,-0.148522,-0.148522,0.208333,-0.187500,-0.187500,-0.102062,-0.187500,1.000000,0.069007,-0.148522,...,-0.102062,-0.148522,-0.102062,-0.102062,-0.187500,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
the fifth element,-0.204980,-0.204980,0.396788,0.724569,-0.258775,-0.140859,-0.258775,0.069007,1.000000,-0.204980,...,-0.140859,0.573944,-0.140859,-0.140859,0.396788,-0.140859,0.184482,-0.140859,-0.204980,0.184482
my life without me,0.441176,-0.117647,-0.148522,-0.148522,0.792118,0.687184,0.321798,-0.148522,-0.204980,1.000000,...,-0.080845,-0.117647,-0.080845,-0.080845,-0.148522,-0.080845,0.441176,0.687184,1.000000,0.441176


In [41]:
min(correlation['shadows in paradise'])

-0.2619684159977922

In [42]:
#correlation.to_pickle(compression='zip')

In [49]:
sugesstion= correlation['get out'].nlargest(n=20, keep='first')

In [50]:
train['get out']

Comedy             1.0
Crime              0.0
Drama              0.0
Horror             0.0
Action             0.0
Thriller           0.0
Adventure          0.0
Science_Fiction    0.0
Romance            0.0
Music              0.0
Mystery            0.0
Fantasy            0.0
Documentary        0.0
War                0.0
Western            0.0
History            0.0
Animation          1.0
Family             0.0
Foreign            0.0
Name: get out, dtype: float64

In [51]:
sugesstion

title
beavis and butt-head do america                         1.0
lissi and the wild emperor                              1.0
werner - volles rooäää!!!                               1.0
kleines arschloch - der film                            1.0
werner - das muss kesseln!!!                            1.0
dieter - the movie                                      1.0
werner - beinhart!                                      1.0
happy feet                                              1.0
big buck bunny                                          1.0
werner - gekotzt wird später!                           1.0
terkel in trouble                                       1.0
aqua teen hunger force colon movie film for theaters    1.0
eight crazy nights                                      1.0
the nine lives of fritz the cat                         1.0
futurama: into the wild green yonder                    1.0
south park: imaginationland                             1.0
the spirit of christmas           

In [53]:
correlation.size

1168682596

In [57]:
movies.iloc[15000:15001]

Unnamed: 0,genres,id,imdb_id,original_language,overview,release_date,runtime,status,title,vote_average,vote_count
15000,"[{'id': 80, 'name': 'Crime'}]",49498,tt0047962,en,Undercover agents investigate a murderer in Ch...,1955-11-30,77.0,Released,the crooked web,5.8,4.0


In [59]:
left = correlation.iloc[ : , 0:15001]

In [60]:
right = correlation.iloc[ : , 15001:]

In [68]:
right

title,comrades,so evil my love,jim jefferies: alcoholocaust,i was a communist for the fbi,chinese coffee,gen-y cops,the change-up,man of steel,nativity!,r.i.p.d.,...,the everyday,rakka,edith walks,chris d'elia: man on fire,firebase,the truth is in the stars,abduction,tragedy in a temporary town,silja - nuorena nukkunut,manuel on the island of wonders
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
shadows in paradise,0.687184,0.687184,0.687184,-0.080845,0.687184,-0.080845,0.687184,-0.177123,0.441176,0.243544,...,-0.080845,-0.117647,-0.080845,0.687184,-0.148522,-0.080845,0.441176,0.687184,0.441176,0.441176
four rooms,-0.080845,-0.080845,0.687184,0.687184,-0.080845,-0.080845,0.687184,-0.177123,0.441176,0.664211,...,-0.080845,-0.117647,-0.080845,0.687184,-0.148522,-0.080845,-0.117647,-0.080845,-0.117647,-0.117647
judgment night,-0.102062,-0.102062,-0.102062,0.544331,-0.102062,0.544331,-0.102062,0.130437,-0.148522,0.484481,...,-0.102062,0.321798,-0.102062,-0.102062,0.208333,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
star wars,-0.102062,-0.102062,-0.102062,-0.102062,-0.102062,0.544331,-0.102062,0.838525,-0.148522,0.130437,...,-0.102062,0.792118,-0.102062,-0.102062,0.604167,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
forrest gump,0.544331,0.544331,0.544331,-0.102062,0.544331,-0.102062,0.544331,-0.223607,0.321798,0.130437,...,-0.102062,-0.148522,-0.102062,0.544331,-0.187500,-0.102062,0.321798,0.544331,0.792118,0.321798
american beauty,1.000000,1.000000,-0.055556,-0.055556,1.000000,-0.055556,-0.055556,-0.121716,-0.080845,-0.121716,...,-0.055556,-0.080845,-0.055556,-0.055556,-0.102062,-0.055556,0.687184,1.000000,0.687184,0.687184
dancer in the dark,0.544331,0.544331,-0.102062,0.544331,0.544331,-0.102062,-0.102062,-0.223607,-0.148522,0.130437,...,-0.102062,-0.148522,-0.102062,-0.102062,-0.187500,-0.102062,0.321798,0.544331,0.321798,0.321798
the dark,-0.102062,-0.102062,-0.102062,-0.102062,-0.102062,-0.102062,-0.102062,-0.223607,-0.148522,-0.223607,...,-0.102062,-0.148522,-0.102062,-0.102062,-0.187500,-0.102062,-0.148522,-0.102062,-0.148522,-0.148522
the fifth element,-0.140859,-0.140859,-0.140859,-0.140859,-0.140859,0.394405,-0.140859,0.864099,-0.204980,0.277746,...,-0.140859,0.573944,-0.140859,-0.140859,0.396788,-0.140859,0.184482,-0.140859,-0.204980,0.184482
my life without me,0.687184,0.687184,-0.080845,-0.080845,0.687184,-0.080845,-0.080845,-0.177123,-0.117647,-0.177123,...,-0.080845,-0.117647,-0.080845,-0.080845,-0.148522,-0.080845,0.441176,0.687184,1.000000,0.441176


In [64]:
upper_left = left.iloc[0:15001, :]

In [65]:
lower_left = left.iloc[15001:, :]

In [67]:
upper_left.size

225030001

In [69]:
upper_right = right.iloc[0:15001, :]

In [70]:
lower_right = right.iloc[15001:, :]

In [71]:
print(upper_left.size)
print(lower_left.size)
print(upper_right.size)
print(lower_right.size)

225030001
287794185
287794185
368064225


In [73]:
upper_left.to_csv('upper_left.csv')