In [None]:
import pandas as pd
import numpy as np

In [None]:
#Q1 = reading files
# Each tab-separated file is loaded with every column as object (no dtype
# inference); numVotes and averageRating are converted to numbers in the later
# cells that need them.
akas = pd.read_csv('title.akas.tsv', sep='\t', dtype=object)
basics = pd.read_csv('title.basics.tsv', sep='\t', dtype=object)
ratings = pd.read_csv('title.ratings.tsv', sep='\t', dtype=object)

# Quick previews. Only the last expression of a cell is rendered, so just
# ratings.head(1) shows up in the cell output.
akas.head()
basics.head(1)
ratings.head(1)

In [None]:
#Q2 = Dropping duplicates
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be bound back to the name for the de-duplication to
# take effect in the following cells.
akas = akas.drop_duplicates()
basics = basics.drop_duplicates()
ratings = ratings.drop_duplicates()
print("done")

In [None]:
#Q3 = Using merge
#Q3.1 Inner merge
# basics and ratings are joined on tconst first; the result is then attached
# to akas by matching akas.titleId against tconst.
newdf = basics.merge(ratings, how='inner', on='tconst')
newdf.head(1)
movies_mergedInner = akas.merge(
    newdf,
    how='inner',
    left_on='titleId',
    right_on='tconst',
)
movies_mergedInner.info()
# author's recorded result: 1686547 rows, 19 columns (re-check against the .info() output)

#Q3.1 Outer merge
# Same two-step join, but keeping rows that have no match on either side.
df = basics.merge(ratings, how='outer', on='tconst')
movies_mergedOuter = akas.merge(
    df,
    how='outer',
    left_on='titleId',
    right_on='tconst',
)
movies_mergedOuter.info()
# author's recorded result: 3419932 rows, 19 columns (re-check against the .info() output)

#Q3.2 Unique method
# nunique() counts the distinct non-missing titleType values of the inner merge
unique = movies_mergedInner.titleType.nunique()
print('Number of unique values in titleType = ', unique)


In [None]:
#Q4
# Keep rows of the outer merge that are movies AND are either released in the
# US region or listed with English as language.
# .copy() makes df_new an independent DataFrame: the following cells add and
# overwrite columns on it, which on a plain boolean-mask slice triggers
# SettingWithCopyWarning and is not guaranteed to behave consistently.
df_new = movies_mergedOuter[
    (movies_mergedOuter['titleType'] == 'movie')
    & (
        (movies_mergedOuter['region'] == 'US')
        | (movies_mergedOuter['language'] == 'en')
    )
].copy()
print("done")

In [None]:
#Q5 add log10Votes to dataframe
# numVotes was read as object, so it must be made numeric before taking logs.
# pd.to_numeric keeps missing values as NaN (the outer merge can leave titles
# without a rating row), whereas astype(str).astype(int) turns NaN into the
# string 'nan' and then raises ValueError. Non-numeric text still raises.
# assign() builds a new frame, so no values are written into a slice of
# movies_mergedOuter and the cell can be re-run safely.
df_new = df_new.assign(numVotes=pd.to_numeric(df_new['numVotes']))
# log10 of a missing vote count stays NaN and is skipped by the later group means
df_new = df_new.assign(log10Votes=np.log10(df_new['numVotes']))
print("done")

In [None]:
#Q6 Lower-case the genres column
# .str.lower() is applied to the whole genres string of each row
df_new['genres'] = df_new['genres'].str.lower()
df_new.head(1)

In [None]:
#Q6.1 mean log10Votes per genres value, largest first; show the top 10
df1 = (
    df_new
    .groupby('genres')['log10Votes']
    .mean()
    .sort_values(ascending=False)
)
df1.head(10)

In [None]:
#Q6.2 mean averageRating per genres value, largest first; show the top 10
# averageRating was read as object; cast it to float (via str) so it can be
# averaged here and used by the later cells.
df_new["averageRating"] = (
    df_new["averageRating"]
    .astype(str)
    .astype(float)
)
df2 = (
    df_new
    .groupby('genres')['averageRating']
    .mean()
    .sort_values(ascending=False)
)
df2.head(10)

In [None]:
#Q7
import matplotlib.pyplot as plt

#Groupby averageRating
# One row per distinct averageRating holding the mean of each numeric column.
# numeric_only=True is required because df_new still carries many object
# (string) columns: pandas >= 2.0 raises TypeError when asked to average them,
# while older versions silently dropped them - so the result is the same on
# both. NOTE: this rebinds df2, replacing the per-genre series from Q6.2; the
# regression cells below rely on this version.
df2 = df_new.groupby('averageRating').mean(numeric_only=True).reset_index()

#Draw scatterplot of averageRating and log10Votes
plt.scatter(x = df2['averageRating'], y = df2['log10Votes'])
plt.title('Scatterplot of averageRating vs log10Votes')
plt.xlabel('averageRating')
plt.ylabel('log10Votes')
plt.show()

In [None]:
#Q8.1 Linear regression with sklearn
from sklearn.linear_model import LinearRegression

# predictor and response as (n, 1) column arrays
X = df2['averageRating'].to_numpy().reshape(-1, 1)
Y = df2['log10Votes'].to_numpy().reshape(-1, 1)

# ordinary least squares with an intercept term; fit() returns the estimator
reg = LinearRegression(fit_intercept=True).fit(X, Y)

# evaluate the fitted line on 1000 evenly spaced ratings between 1 and 10
xfit = np.linspace(1, 10, 1000)
yfit = reg.predict(xfit.reshape(-1, 1))

# data points with the fitted line on top
plt.scatter(X, Y)
plt.plot(xfit, yfit, color='red')
plt.xlabel('averageRating')
plt.ylabel('log10Votes')
plt.title('Linear regression of averageRating on log10Votes')

# fitted parameters (arrays, because Y was passed as a 2-D column)
print('Intercept is ;', reg.intercept_)
print('Slope is;', reg.coef_)

In [None]:
#Q8.2 linear regression using scipy
from scipy import stats

# X and Y are kept as (1, n) row arrays, as before
X = np.array(df2['averageRating']).reshape(1,-1)
Y = np.array(df2['log10Votes']).reshape(1,-1)

# linregress expects two 1-D sequences of equal length. Given the (1, n)
# arrays directly it sees len(x) == 1, so the sample-size dependent results
# (p-value, standard errors) are computed for the wrong n. ravel() hands it
# flat views of the same data.
res = stats.linregress(X.ravel(), Y.ravel())

#create array of predicted values
ypred = (res.intercept + (res.slope*X))

#reshape arrays to columns for plotting
yft = np.array(ypred).reshape(-1,1)
xvals = np.array(X).reshape(-1,1)

#draw plots
plt.scatter(X, Y,)
plt.plot(xvals, yft, color='red') #fitted line
plt.title('Linear regression of averageRating on log10Votes')
plt.xlabel('averageRating')
plt.ylabel('log10Votes')
plt.show()

#print out values for slope and intercept
print('Intercept is ;', res.intercept)
print('Slope is ;', res.slope)


In [None]:
#Q8.3 Linear regression with pytorch
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


#Splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# predictor / response as (n, 1) column arrays
x1 = df2['averageRating'].to_numpy().reshape(-1, 1)
y1 = df2['log10Votes'].to_numpy().reshape(-1, 1)

# 80/20 split with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x1, y1, test_size=0.2, random_state=0
)

# float32 versions of the training arrays for torch.
# Note the capital Y: Y_train is the float32 array, y_train keeps the split's dtype.
X_train, Y_train = X_train.astype(np.float32), y_train.astype(np.float32)

# Fix torch's random seed before the model is created, for reproducibility
torch.manual_seed(1)

# Defining the model architecture.
class LinearRegressionModel(torch.nn.Module):
    """Linear regression expressed as a single torch.nn.Linear(1, 1) layer."""

    def __init__(self):
        """Create the one-input, one-output linear layer."""
        super().__init__()
        # one scalar feature in, one scalar prediction out
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Return the linear layer's output for the input tensor x."""
        return self.linear(x)

# Creating the model
model = LinearRegressionModel()
print(model)

# Defining the Loss Function (mean squared error)
criterion = torch.nn.MSELoss()

# Defining the Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0005)

# Convert data to tensors
# X_train / Y_train are already (n, 1) float32 arrays, so they are handed to
# torch.tensor directly. Building them through `[[x] for x in X_train]` loops
# over every row in Python and wraps each row in an extra list, which adds a
# spurious trailing dimension ((n, 1, 1) instead of (n, 1)). The MSE loss and
# the fitted parameters are unaffected by that extra axis.
data_x = torch.tensor(X_train, dtype = torch.float)
data_y = torch.tensor(Y_train, dtype = torch.float)

#Training the model
# Per-epoch history: loss, learnt slope (layer weight) and learnt intercept (layer bias)
losses, slope_list, intercept_list = [], [], []

EPOCHS = 11000
print('\nTRAINING...')
for epoch in range(EPOCHS):
    # reset the gradients before running back-propagation
    optimizer.zero_grad()

    # forward pass on the whole training tensor, then the loss against the targets
    pred_y = model(data_x)
    loss = criterion(pred_y, data_y)

    # back-propagation followed by the parameter update
    loss.backward()
    optimizer.step()

    # record this epoch's loss and the parameters as they stand after the update
    losses.append(loss.item())
    slope_list.append(model.linear.weight.item())
    intercept_list.append(model.linear.bias.item())

    # report the loss every 1000 epochs (epoch 0 included)
    if not epoch % 1000:
        print('loss: ', loss.item())
        
# Plotting the epoch losses (curve must flatten out if training epochs enough)
plt.plot(losses)
plt.xlabel('#Epoch')
plt.ylabel('Loss')
plt.title('Loss VS Epoch')
plt.show()

#Printing parameters: the values recorded after the final epoch
print("Intercept is  ;", intercept_list[-1])
print("Slope is ;", slope_list[-1])

#Plotting linear regression line
#predictions of the trained model on the training inputs
pred_y = model(data_x)

#detach from the autograd graph, convert to numpy and flatten to columns
preds = pred_y.detach().numpy()
predy = np.reshape(np.array(preds), (-1, 1))
dataX = data_x.detach().numpy()
dat = np.reshape(np.array(dataX), (-1, 1))

#training points with the model's predictions drawn as a line
plt.scatter(X_train, y_train)
plt.plot(dat, predy, color='red')
plt.xlabel("averageRating")
plt.ylabel('log10Votes')
plt.title('Linear regression of averageRating on log10Votes')
plt.show()


In [None]:
#Comparing intercept and slope from all three models
for label, value in (
    ('Sklearn Intercept is ;', reg.intercept_),
    ('Scipy Intercept is ;', res.intercept),
    ("Pytorch Intercept is  ;", intercept_list[-1]),
    ('SkLearn Slope is;', reg.coef_),
    ('SciPy Slope is ;', res.slope),
    ("PyTorch Slope is ;", slope_list[-1]),
):
    print(label, value)
# The PyTorch numbers can differ slightly from the other two: that model is
# fitted iteratively with the Adam optimiser on the 80% training split only,
# whereas the sklearn and scipy fits use every row of df2.