In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import re

In [3]:
# Load the two raw input tables from the working directory.
df_movies, df_ratings = (
    pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv")
)

In [4]:
# Preview the first rows of the movies table (movieId, title, genres).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Persist a copy of the ratings without the timestamp column (no index column written).
df_ratings.drop(columns=["timestamp"]).to_csv(
    path_or_buf="ratings_without_timestamp.csv",
    index=False,
)

In [9]:
# Reload the inputs; ratings now come from the timestamp-free file written above.
df_movies = pd.read_csv(filepath_or_buffer="movies.csv")
df_ratings = pd.read_csv(filepath_or_buffer="ratings_without_timestamp.csv")

In [10]:
# Summary statistics for the numeric columns of the movies table.
df_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [11]:
# Summary statistics for the numeric columns of the ratings table.
df_ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [12]:
# Raw title strings; the previews above show them in the form "Name (YYYY)".
titles = df_movies['title']

In [13]:
def split_title_and_year(raw_title):
    """Split a raw movie title into ``(title, year)``.

    The surrounding whitespace is stripped first, so a title such as
    ``"Name (1995) "`` is handled the same way as ``"Name (1995)"``.

    Resolution order:
      1. A trailing ``(YYYY)`` group is removed from the title and used as
         the year.
      2. Otherwise the first ``(YYYY)`` group found anywhere supplies the
         year and the title text is left unchanged.
      3. Otherwise the year is the sentinel string ``'0000'``.

    Returns:
        tuple[str, str]: the cleaned title and the four-digit year string.
    """
    cleaned = raw_title.strip()

    # Prefer the trailing group so the year reported is the one that is
    # actually removed from the title.
    trailing = re.search(r'\s*\((\d{4})\)$', cleaned)
    if trailing:
        return cleaned[:trailing.start()].strip(), trailing.group(1)

    # Fallback: a year embedded elsewhere in the title.
    embedded = re.search(r'\((\d{4})\)', cleaned)
    if embedded:
        return cleaned, embedded.group(1)

    return cleaned, '0000'


titles_only = []
years_only = []

for raw_title in titles:
    clean_title, release_year = split_title_and_year(raw_title)
    titles_only.append(clean_title)
    years_only.append(release_year)

In [14]:
# Spot-check the first five cleaned titles.
titles_only[:5]

['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II']

In [15]:
# Spot-check the first five extracted years (kept as strings).
years_only[:5]

['1995', '1995', '1995', '1995', '1995']

In [59]:
# Distinct extracted years with their counts; '0000' is the placeholder used
# for titles where no "(YYYY)" group was found.
# NOTE(review): this cell's execution count (59) is out of sequence with its
# neighbours - confirm the notebook still reproduces under Restart & Run All.
np.unique(years_only, return_counts=True)

(array(['0000', '1874', '1878', '1880', '1883', '1887', '1888', '1890',
        '1891', '1892', '1894', '1895', '1896', '1897', '1898', '1899',
        '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907',
        '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915',
        '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923',
        '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931',
        '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939',
        '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947',
        '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955',
        '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963',
        '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971',
        '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
        '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
        '1988', '1989', '1990', '1991', '1992', '1993', '1994', 

In [16]:
# Working copy of the movies table without the raw 'title' column.
df_movies_temp = df_movies.drop(columns='title')

In [17]:
# Display the working frame after the raw 'title' column was dropped.
df_movies_temp

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy
...,...,...
9737,193581,Action|Animation|Comedy|Fantasy
9738,193583,Animation|Comedy|Fantasy
9739,193585,Drama
9740,193587,Action|Animation


In [18]:
# Rebuild the working frame from df_movies before inserting, so this cell can
# be re-run safely: DataFrame.insert mutates in place and raises ValueError
# when the column already exists.
df_movies_temp = df_movies.drop(columns=['title'])
df_movies_temp.insert(1, 'title', titles_only)  # cleaned title, right after movieId
df_movies_temp.insert(2, 'year', years_only)    # year strings ('0000' when none found)

In [19]:
# Display the working frame with the cleaned 'title' and new 'year' columns.
df_movies_temp

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,1995,Comedy|Romance
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II,1995,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero,2017,Animation|Comedy|Fantasy
9739,193585,Flint,2017,Drama
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action|Animation


In [20]:
# Raw genre strings; multiple genres are joined with '|' in a single cell.
genres = df_movies_temp['genres']

In [21]:
# Distinct raw genre strings (whole '|'-joined combinations, not individual genres).
np.unique(genres)

array(['(no genres listed)', 'Action', 'Action|Adventure',
       'Action|Adventure|Animation',
       'Action|Adventure|Animation|Children',
       'Action|Adventure|Animation|Children|Comedy',
       'Action|Adventure|Animation|Children|Comedy|Fantasy',
       'Action|Adventure|Animation|Children|Comedy|IMAX',
       'Action|Adventure|Animation|Children|Comedy|Romance',
       'Action|Adventure|Animation|Children|Comedy|Sci-Fi',
       'Action|Adventure|Animation|Children|Comedy|Sci-Fi|IMAX',
       'Action|Adventure|Animation|Children|Comedy|Western',
       'Action|Adventure|Animation|Children|Fantasy',
       'Action|Adventure|Animation|Children|Fantasy|Sci-Fi',
       'Action|Adventure|Animation|Children|Sci-Fi',
       'Action|Adventure|Animation|Comedy',
       'Action|Adventure|Animation|Comedy|Crime|Mystery',
       'Action|Adventure|Animation|Comedy|Fantasy|Mystery|Sci-Fi',
       'Action|Adventure|Animation|Comedy|Fantasy|Sci-Fi',
       'Action|Adventure|Animation|Comedy|S

In [22]:
# One 0/1 indicator column per individual genre, splitting each string on '|'.
genres_split = genres.str.get_dummies('|')

In [23]:
# Display the genre indicator matrix (one column per genre, values 0/1).
genres_split

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Final movie table: id/title/year columns followed by the genre indicators,
# placed side by side on the shared row index.
df_movies_final = pd.concat(
    objs=(df_movies_temp.drop(columns='genres'), genres_split),
    axis='columns',
)

In [26]:
# Preview the final table: movieId, title, year, then one column per genre.
df_movies_final.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Write the final movie table to disk without the index column.
df_movies_final.to_csv(path_or_buf="df_movies_final.csv", index=False)