## Export data from the old database and import the processed data into the new database

In [1]:
# imports
import psycopg2
import numpy as np
import os
import pandas as pd
from dotenv import load_dotenv

In [27]:
# Pull the connection settings into the process environment, then read them.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
PORT = os.getenv("PORT")
DB_NAME_OLD = os.getenv("DB_NAME_OLD")
DB_NAME_NEW = os.getenv("DB_NAME_NEW")
HOST_OLD = os.getenv("HOST_OLD")
HOST_NEW = os.getenv("HOST_NEW")

# Both databases are reached with the same user, password and port;
# only the host and the database name differ.
shared_settings = {"user": DB_USER, "password": DB_PASSWORD, "port": PORT}

# Connect to the old database
connection_old = psycopg2.connect(
    host=HOST_OLD,
    database=DB_NAME_OLD,
    **shared_settings
)

# Connect to the new database
connection_new = psycopg2.connect(
    host=HOST_NEW,
    database=DB_NAME_NEW,
    **shared_settings
)

### User ratings

In [30]:
# Data from user_groa_ratings, matched to imdb_movies on (title, year) so that
# every rating carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.rating
        FROM user_groa_ratings a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above.
user_groa_ratings = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'rating': [row[4] for row in rows]
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_groa_ratings = user_groa_ratings.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_groa_ratings = user_groa_ratings.drop(columns=['id'])
user_groa_ratings.head()

Unnamed: 0,user_id,movie_id,date,rating
0,312517,1057500,2013-02-11,2.5
1,312517,1051904,2016-05-23,2.0
2,311074,1051904,2016-05-23,2.0
3,311074,1057500,2013-02-11,2.5
4,306626,10925772,2020-03-27,3.5


In [31]:
# Get the rating from user_groa_reviews, matched to imdb_movies on
# (title, year) so that every rating carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.rating
        FROM user_groa_reviews a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above.
user_groa_ratings_rev = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'rating': [row[4] for row in rows]
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_groa_ratings_rev = user_groa_ratings_rev.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_groa_ratings_rev = user_groa_ratings_rev.drop(columns=['id'])
user_groa_ratings_rev.head()

Unnamed: 0,user_id,movie_id,date,rating
0,311074,1228987,2012-11-24,4.0
1,312517,1170358,2013-12-30,1.0
2,311074,1170358,2013-12-30,1.0
3,312517,1228987,2012-11-24,4.0
4,315733,4633694,2019-01-05,3.0


In [32]:
# Stack the ratings taken from the ratings table on top of the ones taken
# from the reviews table.
user_groa_ratings = pd.concat([user_groa_ratings, user_groa_ratings_rev])

# Tag every row with the platform it came from.
user_groa_ratings['source'] = 'groa'
user_groa_ratings.head()

Unnamed: 0,user_id,movie_id,date,rating,source
0,312517,1057500,2013-02-11,2.5,groa
1,312517,1051904,2016-05-23,2.0,groa
2,311074,1051904,2016-05-23,2.0,groa
3,311074,1057500,2013-02-11,2.5,groa
4,306626,10925772,2020-03-27,3.5,groa


In [33]:
# Data from user_imdb_ratings, matched to imdb_movies on (title, year) so that
# every rating carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.rating
        FROM user_imdb_ratings a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above.
user_imdb_ratings = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'rating': [row[4] for row in rows]
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_imdb_ratings = user_imdb_ratings.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_imdb_ratings = user_imdb_ratings.drop(columns=['id'])

# Add the source
user_imdb_ratings['source'] = 'imdb'
user_imdb_ratings.head()

Unnamed: 0,user_id,movie_id,date,rating,source
0,11,1155592,2015-03-31,4.5,imdb
1,11,1119646,2018-12-18,4.5,imdb
2,11,1119646,2018-12-18,4.5,imdb
3,11,1119646,2018-12-18,4.5,imdb
4,11,1119646,2018-12-18,4.5,imdb


In [34]:
# Data from user_letterboxd_ratings, matched to imdb_movies on (title, year)
# so that every rating carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.rating
        FROM user_letterboxd_ratings a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above.
user_letterboxd_ratings = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'rating': [row[4] for row in rows]
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_letterboxd_ratings = user_letterboxd_ratings.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_letterboxd_ratings = user_letterboxd_ratings.drop(columns=['id'])
user_letterboxd_ratings.head()

Unnamed: 0,user_id,movie_id,date,rating
0,15061,114369,2012-09-21,4.0
1,7968,114369,2012-09-20,4.0
2,7968,372784,2012-09-20,3.5
3,7968,456912,2013-02-09,4.0
4,15061,462499,2012-10-18,3.0


In [35]:
# Get the rating from user_letterboxd_reviews, matched to imdb_movies on
# (title, year) so that every rating carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.rating
        FROM user_letterboxd_reviews a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above.
user_letterboxd_ratings_rev = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'rating': [row[4] for row in rows]
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_letterboxd_ratings_rev = user_letterboxd_ratings_rev.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_letterboxd_ratings_rev = user_letterboxd_ratings_rev.drop(columns=['id'])

# Concatenate the ratings-table rows with the reviews-table rows
user_letterboxd_ratings = pd.concat([user_letterboxd_ratings, user_letterboxd_ratings_rev])

# Add the source
user_letterboxd_ratings['source'] = 'letterboxd'
user_letterboxd_ratings.head()

Unnamed: 0,user_id,movie_id,date,rating,source
0,15061,114369,2012-09-21,4.0,letterboxd
1,7968,114369,2012-09-20,4.0,letterboxd
2,7968,372784,2012-09-20,3.5,letterboxd
3,7968,456912,2013-02-09,4.0,letterboxd
4,15061,462499,2012-10-18,3.0,letterboxd


In [36]:
# Report the shape of each per-source ratings table and their combined row count.
rating_frames = {
    'user_groa_ratings': user_groa_ratings,
    'user_imdb_ratings': user_imdb_ratings,
    'user_letterboxd_ratings': user_letterboxd_ratings,
}
for label, frame in rating_frames.items():
    print(f'shape of {label}:', frame.shape)
print('Total rows:', sum(frame.shape[0] for frame in rating_frames.values()))

shape of user_groa_ratings: (9685, 5)
shape of user_imdb_ratings: (18328, 5)
shape of user_letterboxd_ratings: (117877, 5)
Total rows: 145890


In [37]:
# Stack the groa, imdb and letterboxd ratings into a single table.
user_ratings = pd.concat([user_groa_ratings, user_imdb_ratings, user_letterboxd_ratings])
print(user_ratings.shape)
user_ratings.head()

(145890, 5)


Unnamed: 0,user_id,movie_id,date,rating,source
0,312517,1057500,2013-02-11,2.5,groa
1,312517,1051904,2016-05-23,2.0,groa
2,311074,1051904,2016-05-23,2.0,groa
3,311074,1057500,2013-02-11,2.5,groa
4,306626,10925772,2020-03-27,3.5,groa


In [38]:
# Write the combined ratings to user_ratings.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
user_ratings.to_csv('user_ratings.csv', index=False)

### User reviews

In [182]:
# Data from user_groa_reviews, matched to imdb_movies on (title, year) so that
# every review carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.review, a.tags
        FROM user_groa_reviews a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above. The query selects no review
# title, so that column is filled with NaN. `np.nan` is used because the
# `np.NaN` alias no longer exists in NumPy 2.x.
user_groa_reviews = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'review_title': [np.nan] * len(rows),
    'review_text': [row[4] for row in rows],
    'tags': [row[5] for row in rows],
    'source': ['groa'] * len(rows)
})
print(user_groa_reviews.shape)
user_groa_reviews.head()

(315, 8)


Unnamed: 0,id,user_id,movie_id,date,review_title,review_text,tags,source
0,9519,312517,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
1,7433,311074,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
2,7364,311074,1250777,2012-09-21,,"A marvel of filmmaking fun, fists, frisson, an...",,groa
3,7398,311074,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa
4,9484,312517,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa


In [183]:
# Check for duplicates: show every row whose id occurs more than once.
# A boolean mask is used instead of pd.concat over groups, because pd.concat
# raises ValueError when there are no duplicated ids at all; this version
# shows an empty frame in that case.
duplicated_id_rows = user_groa_reviews[user_groa_reviews.duplicated(subset=['id'], keep=False)]

# A stable sort keeps rows sharing an id together, in their original relative order.
duplicated_id_rows.sort_values('id', kind='mergesort')

Unnamed: 0,id,user_id,movie_id,date,review_title,review_text,tags,source
36,7440,311074,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,groa
146,7440,311074,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,groa
35,9525,312517,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,groa
129,9525,312517,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,groa


In [184]:
# Discard every row whose id occurs more than once (keep=False keeps none of
# the copies), then remove the id column itself.
user_groa_reviews = (
    user_groa_reviews
    .drop_duplicates(subset=['id'], keep=False)
    .drop(columns=['id'])
)
user_groa_reviews.head()

Unnamed: 0,user_id,movie_id,date,review_title,review_text,tags,source
0,312517,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
1,311074,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
2,311074,1250777,2012-09-21,,"A marvel of filmmaking fun, fists, frisson, an...",,groa
3,311074,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa
4,312517,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa


In [185]:
# Data from user_letterboxd_reviews, matched to imdb_movies on (title, year)
# so that every review carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date, a.review, a.tags
        FROM user_letterboxd_reviews a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above. The query selects no review
# title, so that column is filled with NaN. `np.nan` is used because the
# `np.NaN` alias no longer exists in NumPy 2.x.
user_letterboxd_reviews = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'review_title': [np.nan] * len(rows),
    'review_text': [row[4] for row in rows],
    'tags': [row[5] for row in rows],
    'source': ['letterboxd'] * len(rows)
})
print(user_letterboxd_reviews.shape)
user_letterboxd_reviews.head()

(11210, 8)


Unnamed: 0,id,user_id,movie_id,date,review_title,review_text,tags,source
0,593,19614,33870,2012-09-21,,Humphrey Bogart makes this film with the colde...,,letterboxd
1,11375,276525,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
2,12517,306631,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
3,12164,297984,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
4,8133,197098,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd


In [186]:
# Check for duplicates: show every row whose id occurs more than once.
# A boolean mask is used instead of pd.concat over groups, because pd.concat
# raises ValueError when there are no duplicated ids at all; this version
# shows an empty frame in that case.
duplicated_id_rows = user_letterboxd_reviews[
    user_letterboxd_reviews.duplicated(subset=['id'], keep=False)
]

# A stable sort keeps rows sharing an id together, in their original relative order.
duplicated_id_rows.sort_values('id', kind='mergesort')

Unnamed: 0,id,user_id,movie_id,date,review_title,review_text,tags,source
7483,91,1,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
8498,91,1,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
7537,187,5028,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
8552,187,5028,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
7536,284,5028,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
...,...,...,...,...,...,...,...,...
8484,12080,294333,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
7466,12338,301078,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
8481,12338,301078,1935179,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd
7436,12634,48485,2514592,2018-01-11,,"""The deal was for the gun, not for bullets."" ""...",,letterboxd


In [187]:
# Discard every row whose id occurs more than once (keep=False keeps none of
# the copies), then remove the id column itself.
user_letterboxd_reviews = (
    user_letterboxd_reviews
    .drop_duplicates(subset=['id'], keep=False)
    .drop(columns=['id'])
)
user_letterboxd_reviews.head()

Unnamed: 0,user_id,movie_id,date,review_title,review_text,tags,source
0,19614,33870,2012-09-21,,Humphrey Bogart makes this film with the colde...,,letterboxd
1,276525,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
2,306631,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
3,297984,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd
4,197098,30287,2019-02-14,,"""I like my convictions undiluted -- same as I ...",,letterboxd


In [188]:
# Report the shape of each per-source review table and their combined row count.
review_frames = {
    'user_groa_reviews': user_groa_reviews,
    'user_letterboxd_reviews': user_letterboxd_reviews,
}
for label, frame in review_frames.items():
    print(f'shape of {label}:', frame.shape)
print('Total rows:', sum(frame.shape[0] for frame in review_frames.values()))

shape of user_groa_reviews: (311, 7)
shape of user_letterboxd_reviews: (10996, 7)
Total rows: 11307


In [189]:
# Stack the groa and letterboxd reviews into a single table.
user_reviews = pd.concat([user_groa_reviews, user_letterboxd_reviews])
print(user_reviews.shape)
user_reviews.head()

(11307, 7)


Unnamed: 0,user_id,movie_id,date,review_title,review_text,tags,source
0,312517,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
1,311074,1170358,2013-12-30,,I curled my upper lip so hard that it creased....,,groa
2,311074,1250777,2012-09-21,,"A marvel of filmmaking fun, fists, frisson, an...",,groa
3,311074,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa
4,312517,1228987,2012-11-24,,Basically superfluous. The original was perfec...,,groa


In [190]:
# Write the combined reviews to user_reviews.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
user_reviews.to_csv('user_reviews.csv', index=False)

### User watched

In [191]:
# Data from user_letterboxd_watched, matched to imdb_movies on (title, year)
# so that every entry carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date
        FROM user_letterboxd_watched a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above; every row is tagged with its source.
user_letterboxd_watched = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'source': ['letterboxd'] * len(rows)
})
print(user_letterboxd_watched.shape)
user_letterboxd_watched.head()

(159231, 5)


Unnamed: 0,id,user_id,movie_id,date,source
0,1177,296,371746,2012-09-20,letterboxd
1,1227,296,33467,2012-09-20,letterboxd
2,1261,296,114814,2012-09-20,letterboxd
3,1764,296,443536,2012-11-05,letterboxd
4,1785,296,358082,2012-11-09,letterboxd


In [192]:
# Check for duplicates: show every row whose id occurs more than once.
# A boolean mask is used instead of pd.concat over groups, because pd.concat
# raises ValueError when there are no duplicated ids at all; this version
# shows an empty frame in that case.
duplicated_id_rows = user_letterboxd_watched[
    user_letterboxd_watched.duplicated(subset=['id'], keep=False)
]

# A stable sort keeps rows sharing an id together, in their original relative order.
duplicated_id_rows.sort_values('id', kind='mergesort')

Unnamed: 0,id,user_id,movie_id,date,source
113951,36,1,0229208,2020-02-13,letterboxd
113952,36,1,0086879,2020-02-13,letterboxd
94814,1181,296,1655442,2012-09-20,letterboxd
94816,1181,296,1825978,2012-09-20,letterboxd
100866,1344,296,0229208,2012-09-20,letterboxd
...,...,...,...,...,...
113693,316679,316396,2514592,2018-12-18,letterboxd
156218,316688,316396,1014759,2018-12-18,letterboxd
156219,316688,316396,2049386,2018-12-18,letterboxd
55389,317020,316396,1895587,2019-03-15,letterboxd


In [193]:
# Discard every row whose id occurs more than once (keep=False keeps none of
# the copies), then remove the id column itself.
user_letterboxd_watched = (
    user_letterboxd_watched
    .drop_duplicates(subset=['id'], keep=False)
    .drop(columns=['id'])
)
user_letterboxd_watched.head()

Unnamed: 0,user_id,movie_id,date,source
0,296,371746,2012-09-20,letterboxd
1,296,33467,2012-09-20,letterboxd
2,296,114814,2012-09-20,letterboxd
3,296,443536,2012-11-05,letterboxd
4,296,358082,2012-11-09,letterboxd


In [194]:
# Write the watched entries to user_watched.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
user_letterboxd_watched.to_csv('user_watched.csv', index=False)

### User Watchlist

In [3]:
# Data from user_letterboxd_watchlist, matched to imdb_movies on (title, year)
# so that every entry carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date
        FROM user_letterboxd_watchlist a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above; every row is tagged with its source.
user_letterboxd_watchlist = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'source': ['letterboxd'] * len(rows)
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_letterboxd_watchlist = user_letterboxd_watchlist.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_letterboxd_watchlist = user_letterboxd_watchlist.drop(columns=['id'])
user_letterboxd_watchlist.head()

Unnamed: 0,user_id,movie_id,date,source
0,,107688,2012-09-20,letterboxd
1,,49406,2012-09-20,letterboxd
2,,358273,2012-09-20,letterboxd
3,,112864,2012-09-20,letterboxd
4,,395584,2012-09-23,letterboxd


In [4]:
# Table size, followed by the number of missing values in each column.
print(user_letterboxd_watchlist.shape)
user_letterboxd_watchlist.isna().sum()

(92549, 4)


user_id     2086
movie_id       0
date           0
source         0
dtype: int64

In [5]:
# Keep only rows with no missing value in any column.
complete_rows = user_letterboxd_watchlist.dropna()

# user_id was loaded as float because of the missing values; with those rows
# gone it can be cast to integer.
user_letterboxd_watchlist = complete_rows.astype({'user_id': int})
user_letterboxd_watchlist.shape

(90463, 4)

In [6]:
# Data from user_groa_watchlist, matched to imdb_movies on (title, year) so
# that every entry carries a movie_id.
query = """SELECT a.id, a.user_id, b.movie_id, a.date
        FROM user_groa_watchlist a, imdb_movies b
        WHERE (a.name=b.primary_title AND a.year=b.start_year) OR
                (a.name=b.original_title AND a.year=b.start_year)"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Column order follows the SELECT list above; every row is tagged with its source.
user_groa_watchlist = pd.DataFrame({
    'id': [row[0] for row in rows],
    'user_id': [row[1] for row in rows],
    'movie_id': [row[2] for row in rows],
    'date': [row[3] for row in rows],
    'source': ['groa'] * len(rows)
})

# A source row that matched more than one movie shows up several times with
# the same id; keep=False discards ALL copies of such ambiguous rows.
user_groa_watchlist = user_groa_watchlist.drop_duplicates(subset=['id'], keep=False)

# The source id was only needed for de-duplication.
user_groa_watchlist = user_groa_watchlist.drop(columns=['id'])
user_groa_watchlist.head()

Unnamed: 0,user_id,movie_id,date,source
0,312517,1069238,2012-10-23,groa
1,311074,1069238,2012-10-23,groa
2,312517,1087578,2013-04-10,groa
3,311074,1087578,2013-04-10,groa
4,312517,1091722,2013-07-17,groa


In [7]:
# Shapes of the user watchlist tables and their combined row count
watchlist_frames = {
    'user_groa_watchlist': user_groa_watchlist,
    'user_letterboxd_watchlist': user_letterboxd_watchlist,
}
for label, frame in watchlist_frames.items():
    print(f'shape of {label}:', frame.shape)
print('Total rows:', sum(frame.shape[0] for frame in watchlist_frames.values()))

# Stack the groa and letterboxd watchlists (groa rows first) into one table.
user_watchlist = pd.concat(list(watchlist_frames.values()))
print(user_watchlist.shape)
user_watchlist.head()

shape of user_groa_watchlist: (2631, 4)
shape of user_letterboxd_watchlist: (90463, 4)
Total rows: 93094
(93094, 4)


Unnamed: 0,user_id,movie_id,date,source
0,312517,1069238,2012-10-23,groa
1,311074,1069238,2012-10-23,groa
2,312517,1087578,2013-04-10,groa
3,311074,1087578,2013-04-10,groa
4,312517,1091722,2013-07-17,groa


In [8]:
# Write the combined watchlist to user_watchlist.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
user_watchlist.to_csv('user_watchlist.csv', index=False)

### Movie reviews

In [3]:
# Fetch the movie ids present in the new database's movies table.
query = """SELECT movie_id FROM movies"""

# The context manager closes the cursor once the ids are read (the original
# cursor was never closed).
with connection_new.cursor() as cursor_new:
    cursor_new.execute(query)
    movie_ids = [row[0] for row in cursor_new.fetchall()]

In [4]:
# Convert the list to a set: duplicates collapse and `in` checks become hash lookups.
movie_ids = set(movie_ids)

In [5]:
# Look at a single element to see which Python type the movie ids have;
# the loop stops after the first one and prints nothing for an empty set.
for sample_id in movie_ids:
    print(type(sample_id))
    break

<class 'str'>


In [6]:
# Data from imdb_reviews (all columns, in the table's own column order).
# NOTE(review): later cells address these rows by position 0-7, so they rely
# on the column order of imdb_reviews; confirm it matches the target table.
query = """SELECT * FROM imdb_reviews"""

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    movie_reviews = cursor_old.fetchall()

In [7]:
# Keep only reviews whose first column is a movie id known to movie_ids,
# copying the first eight columns of each kept row into a list.
movie_reviews_lst = [
    [review[column] for column in range(8)]
    for review in movie_reviews
    if review[0] in movie_ids
]

len(movie_reviews_lst)

3453257

In [8]:
# Number of rows fetched from imdb_reviews before any filtering.
len(movie_reviews)

3462445

In [24]:
from psycopg2.extras import execute_batch

# Insert the filtered reviews into movie_reviews, `step` rows per call.
# Nothing is committed in this cell and the cursor is left open.
cursor_new = connection_new.cursor()
step = 1000

insert_sql = """
        INSERT INTO movie_reviews (movie_id, review_date, user_rating, helpful_num, helpful_denom, user_name, review_text, review_title)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
        """

# Walk the list that is actually sliced. Bounding the loop by the longer,
# unfiltered movie_reviews produced trailing iterations with empty batches.
for ix in range(0, len(movie_reviews_lst), step):
    print(f"doing step: {ix}")

    batch = movie_reviews_lst[ix:ix+step]

    execute_batch(cursor_new, insert_sql, batch)

doing step: 0
doing step: 1000
doing step: 2000
doing step: 3000
doing step: 4000
doing step: 5000
doing step: 6000
doing step: 7000
doing step: 8000
doing step: 9000
doing step: 10000
doing step: 11000
doing step: 12000
doing step: 13000
doing step: 14000
doing step: 15000
doing step: 16000
doing step: 17000
doing step: 18000
doing step: 19000
doing step: 20000
doing step: 21000
doing step: 22000
doing step: 23000
doing step: 24000
doing step: 25000
doing step: 26000
doing step: 27000
doing step: 28000
doing step: 29000
doing step: 30000
doing step: 31000
doing step: 32000
doing step: 33000
doing step: 34000
doing step: 35000
doing step: 36000
doing step: 37000
doing step: 38000
doing step: 39000
doing step: 40000
doing step: 41000
doing step: 42000
doing step: 43000
doing step: 44000
doing step: 45000
doing step: 46000
doing step: 47000
doing step: 48000
doing step: 49000
doing step: 50000
doing step: 51000
doing step: 52000
doing step: 53000
doing step: 54000
doing step: 55000
doing

In [25]:
# Release the cursor on the new database, then commit the connection's
# pending transaction.
cursor_new.close()
connection_new.commit()

### Recommendations tables

In [27]:
# Data from recommendations (all columns, in the table's own column order).
# NOTE(review): columns are addressed by position (0, 2, 3, 4), so this relies
# on the column order of the recommendations table; confirm against the schema.
query = "SELECT * FROM recommendations"

# The context manager closes the cursor even if the query fails.
with connection_old.cursor() as cursor_old:
    cursor_old.execute(query)
    rows = cursor_old.fetchall()

# Kept as a notebook-level list as well as a DataFrame column, so any cell
# that indexes `recommendation_json` directly keeps working.
recommendation_json = [row[2] for row in rows]

recommendations = pd.DataFrame({
    'user_id': [row[0] for row in rows],
    'recommendation_json': recommendation_json,
    'date': [row[3] for row in rows],
    'model_type': [row[4] for row in rows]
})
print(recommendations.shape)
recommendations.head()

(192, 4)


Unnamed: 0,user_id,recommendation_json,date,model_type
0,277844,"[{'Title': 'Breaking News', 'Year': 2004, 'IMD...",2020-03-13 00:00:00+00:00,ratings model
1,6,"[{'Title': 'La Dolce Vita', 'Year': 1960, 'IMD...",2020-03-13 00:00:00+00:00,ratings model
2,266368,"[{'Title': 'Stranger Than Paradise', 'Year': 1...",2020-03-13 00:00:00+00:00,ratings model
3,284205,"[{'Title': 'Spider-Man', 'Year': 2002, 'IMDB U...",2020-03-13 00:00:00+00:00,ratings model
4,284386,"[{'Title': 'Spider-Man', 'Year': 2002, 'IMDB U...",2020-03-13 00:00:00+00:00,ratings model


In [28]:
# Number the recommendations 1..N in row order to give each a new integer id.
total_recommendations = len(recommendations)
recommendations['recommendation_id'] = list(range(1, total_recommendations + 1))
recommendations.head()

Unnamed: 0,user_id,recommendation_json,date,model_type,recommendation_id
0,277844,"[{'Title': 'Breaking News', 'Year': 2004, 'IMD...",2020-03-13 00:00:00+00:00,ratings model,1
1,6,"[{'Title': 'La Dolce Vita', 'Year': 1960, 'IMD...",2020-03-13 00:00:00+00:00,ratings model,2
2,266368,"[{'Title': 'Stranger Than Paradise', 'Year': 1...",2020-03-13 00:00:00+00:00,ratings model,3
3,284205,"[{'Title': 'Spider-Man', 'Year': 2002, 'IMDB U...",2020-03-13 00:00:00+00:00,ratings model,4
4,284386,"[{'Title': 'Spider-Man', 'Year': 2002, 'IMDB U...",2020-03-13 00:00:00+00:00,ratings model,5


In [None]:
# Columns that make up the new recommendations table (the JSON payload is left out).
table_columns = ['recommendation_id', 'user_id', 'date', 'model_type']
recommendations_table = recommendations[table_columns]

# Write them to recommendations_table.csv without the DataFrame index.
recommendations_table.to_csv('recommendations_table.csv', index=False)

In [None]:
# Get all movie ids from the new database's movies table as a set.
query = """SELECT movie_id FROM movies"""

# The context manager closes the cursor once the ids are read (the original
# cursor was never closed).
with connection_new.cursor() as cursor_new:
    cursor_new.execute(query)
    movie_ids = {row[0] for row in cursor_new.fetchall()}

In [None]:
# Get data for recommendations_movies table: one row per recommended movie
# whose 'ID' is present in movie_ids.
recommendations_movies = recommendations[['recommendation_id', 'recommendation_json']]

recommendation_id = []
movie_number = []
movie_id = []
num_recs = []

# Iterate the two selected columns in lockstep. This no longer depends on a
# separate `recommendation_json` list left over from another cell, nor on the
# frame's index labels being 0..N-1.
for rec_id, rec_list in zip(recommendations_movies['recommendation_id'],
                            recommendations_movies['recommendation_json']):
    list_length = len(rec_list)
    for position, entry in enumerate(rec_list, start=1):
        # Skip movies that are not present in movie_ids.
        if entry['ID'] not in movie_ids:
            continue
        recommendation_id.append(rec_id)
        # 1-based position within the full list, so skipped movies leave gaps.
        movie_number.append(position)
        movie_id.append(entry['ID'])
        # Length of the full list, including any skipped entries.
        num_recs.append(list_length)

recommendations_movies = pd.DataFrame({
    'recommendation_id': recommendation_id,
    'movie_number': movie_number,
    'movie_id': movie_id,
    'num_recs': num_recs
})

# Save the dataframe into csv file
recommendations_movies.to_csv('recommendations_movies.csv', index=False)