## Importing Basic Libraries 

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

#### Import the Updated.csv dataset for Data Analysis and Machine Learning

In [2]:
# Load the dataset into a DataFrame. 'Updated.csv' is a relative path, so it is
# resolved against the kernel's current working directory; the recorded run of
# this cell ended in FileNotFoundError, i.e. the file was not found there.
updatedDF = pd.read_csv('Updated.csv')

FileNotFoundError: ignored

In [None]:
updatedDF.info()

In [None]:
updatedDF

### Regression between dateDifference (Predictor Variable) and videoViewCount (Response Variable)

Predictor Variable: dateDifference
- dateDifference is the numeric variable that measures the date difference between the published date of the video and the reference date of 30th March 2023

Response Variable: videoViewCount
- videoViewCount is the numeric variable that measures the number of views for each particular video in the dataset

#### Part (a) Uni-Variate Linear Regression Model
---
Build a Linear Regression Model to predict:
- videoViewCount (Response) using dateDifference (Predictor) 

> Regression Model: videoViewCount = a*dateDifference + b

Extract the variables with the associated data in respective dataframe

In [None]:
#Response variable: the 'videoViewCount' column kept as a single-column DataFrame
videoViewCount = updatedDF['videoViewCount'].to_frame()

In [None]:
#Predictor variable: the 'dateDifference' column kept as a single-column DataFrame
dateDifference = updatedDF['dateDifference'].to_frame()

### Regression via Random Train-Test Split
---
Randomly split the main dataset into a training set and a test set.

Thereafter, fit a linear regression model on the training set and evaluate it on both the training and test sets.

In [None]:
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split repeatable
trainXDD, testXDD, trainYVC, testYVC = train_test_split(
    dateDifference,
    videoViewCount,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Fit a linear regression on the training split:
#(response) videoViewCount explained by (predictor) dateDifference
linRegModel = LinearRegression().fit(trainXDD, trainYVC)  # fit() returns the fitted estimator
linRegModel

In [None]:
#Report the fitted parameters of the regression line
#   videoViewCount = a * dateDifference + b
#   a: slope (coefficient), b: y-intercept
intercept_b = linRegModel.intercept_
slope_a = linRegModel.coef_

print('Intercept of Regression Line \t: b= ', intercept_b)
print('Coefficients of Regression Line : a= ', slope_a)


### Regression Line Equation
---
videoViewCount = -14.64 *dateDifference + 119339.43

In [None]:
#Predict videoViewCount for the training predictors (trainXDD) using the fitted model;
#these fitted values are plotted and scored in the cells that follow.

trainYVC_predicted = linRegModel.predict(trainXDD)

In [None]:
#Plot the training data (green) together with the model's fitted values (red)
f = plt.figure(figsize=(16,8))
plt.scatter(trainXDD, trainYVC, color ='g', label='Actual (train)')
plt.scatter(trainXDD, trainYVC_predicted, color = 'r', label='Predicted (linear regression)')
# Axis labels and a legend let the figure be read on its own
plt.xlabel('dateDifference')
plt.ylabel('videoViewCount')
plt.legend()
plt.show()

## Checking the trained Model
---
1. Check the goodness of fit of trained model
2. Check the Prediction Accuracy of the model on the Test Set
3. Print the metrics for Goodness of Fit and Prediction Accuracy appropriate in this scenario

### Checking the goodness of fit of the trained model
--- 
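How do we check the goodness of fit?
- Explained Variance (R^2)
   - The higher the R^2 value, the better the fit/model
   - On the training data of a least-squares fit with an intercept, 0 <= R^2 <= 1; on unseen (test) data R^2 can be negative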
- Mean Square Error (MSE)
   - The lower the MSE value, the better the fit/model.  

In [None]:
#Calculating the Mean Squared Error (MSE)
def MSE(actual, predicted):
    """Return the mean squared error between `actual` and `predicted`.

    Parameters
    ----------
    actual, predicted : array-like
        Observed and predicted values (lists, NumPy arrays, pandas objects).

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences.

    Notes
    -----
    When the two inputs hold the same number of elements but have different
    shapes (e.g. an (n, 1) column and an (n,) vector), both are flattened
    first. Without this, NumPy would broadcast the subtraction to an (n, n)
    matrix and the result would not be the MSE. Inputs with different sizes
    (e.g. an array and a scalar) still follow normal broadcasting.
    """
    # Work in float so large integer counts cannot overflow when squared
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)
    if actual_arr.shape != predicted_arr.shape and actual_arr.size == predicted_arr.size:
        actual_arr = actual_arr.ravel()
        predicted_arr = predicted_arr.ravel()
    return np.mean(np.square(actual_arr - predicted_arr))
#Goodness of fit on the training split: R^2 first, then MSE and its square root
train_r2 = linRegModel.score(trainXDD, trainYVC)
print("Explained Variance (R^2) of model \t:", train_r2)

mse = MSE(trainYVC, trainYVC_predicted)
rmse = np.sqrt(mse)
print("Mean Squared Error (MSE) of model\t:", mse)
print("Root Mean Squared Error (RMSE) of model\t:", rmse)

### Checking the Prediction Accuracy on the Test Set

In [None]:
#Use the fitted model to predict videoViewCount for the held-out dateDifference values
testYVC_predicted = linRegModel.predict(testXDD)

#Overlay the actual test values (green) and the predicted values (red) on one figure
f = plt.figure(figsize=(16,8))
for y_values, colour in ((testYVC, 'g'), (testYVC_predicted, 'r')):
    plt.scatter(testXDD, y_values, color=colour)
plt.show()

In [None]:
#Evaluate the linear model on the test split: R^2, MSE and RMSE
test_r2 = linRegModel.score(testXDD, testYVC)
print("Explained Variance (R^2) of test set\t\t:", test_r2)

mse = MSE(testYVC, testYVC_predicted)
rmse = np.sqrt(mse)
print("Mean Squared Error (MSE) of test set\t\t:", mse)
print("Root Mean Squared Error (RMSE) of test set\t:", rmse)

In [None]:
# Independent two-column working frame used by the clustering cells
df = updatedDF[['videoViewCount', 'dateDifference']].copy()

In [None]:
df.head()

### Elbow Plot to find the optimal K value for K-means clustering algorithm

In [None]:
# Elbow method: fit K-means for K = 1..9 and record each model's inertia_
# (within-cluster sum of squared distances).
SSE = []
kRange = range(1,10)
for k in kRange:
    # random_state makes the centroid initialisation (and therefore the curve)
    # repeatable; n_init is given explicitly so the number of restarts does not
    # depend on the installed scikit-learn version's default.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df[['videoViewCount','dateDifference']])
    SSE.append(km.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters K
axes = plt.gca()
axes.set_xlabel('K')
axes.set_ylabel('Sum of Squared error')
axes.plot(kRange, SSE)

# Optimal K-Value
---
Optimal K-Value is 3. This means that the data points are best sorted in 3 clusters. 

In [None]:
# Raw relationship between the two variables before clustering
axes = plt.gca()
axes.scatter(df['dateDifference'], df['videoViewCount'])
axes.set_xlabel('dateDifference')
axes.set_ylabel('videoViewCount')

In [None]:
# Cluster the rows into 3 groups. A fixed random_state keeps the cluster labels
# (0/1/2) stable between runs, which the colour-coded plots below rely on;
# n_init is set explicitly so it does not depend on the library default.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit_predict returns one cluster label per row, in row order
y_predicted = km.fit_predict(df[['dateDifference','videoViewCount']])
y_predicted

In [None]:
# Attach the cluster label of every row as a 'Clusters' column, then inspect the frame
df = df.assign(Clusters=y_predicted)
df.info()
df

In [None]:
# Cluster centres of the fitted K-means model; the columns follow the order used
# when fitting (dateDifference, videoViewCount).
Centre = km.cluster_centers_

In [None]:
Centre

In [None]:
#Split the frame into one sub-frame per cluster label (0, 1, 2)
C1, C2, C3 = (df[df['Clusters'] == label] for label in range(3))

In [None]:
# Draw each cluster in its own colour, then mark the fitted centroids
for cluster_frame, colour in ((C1, 'green'), (C2, 'red'), (C3, 'blue')):
    plt.scatter(cluster_frame['dateDifference'], cluster_frame['videoViewCount'], color=colour)

# Centroid columns follow the fit order: 0 = dateDifference, 1 = videoViewCount
centroids = km.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], color='yellow', marker='*', label='centroid')
plt.xlabel('dateDifference')
plt.ylabel('videoViewCount')
plt.legend()

### Pre-Processing of DataFrames using Min-Max Scaler


In [None]:
# Rescale both variables to [0, 1], one column at a time, reusing one scaler.
# Order matters only for the scaler's final state: it is fitted on
# videoViewCount (y-axis, response) first and on dateDifference (x-axis,
# predictor) last.
Scaler = MinMaxScaler()
for column in ('videoViewCount', 'dateDifference'):
    df[column] = Scaler.fit_transform(df[[column]])

In [None]:
df

In [None]:
# Same scatter as before, now on the min-max scaled values
scaled_axes = plt.gca()
scaled_axes.scatter(df['dateDifference'], df['videoViewCount'])
scaled_axes.set_xlabel('dateDifference')
scaled_axes.set_ylabel('videoViewCount')

In [None]:
# Re-run 3-cluster K-means on the scaled columns. random_state fixes the
# initialisation so the cluster labels are repeatable; n_init is explicit so it
# does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
# One cluster label per row, in row order
y_predicted = km.fit_predict(df[['dateDifference','videoViewCount']])
y_predicted

In [None]:
# Replace the 'Clusters' column with the labels from the latest K-means run, then inspect
df = df.assign(Clusters=y_predicted)
df.info()
df

In [None]:
# Cluster centres of the most recently fitted K-means model; the columns follow
# the fit order (dateDifference, videoViewCount). Displayed as the cell output.
Centre = km.cluster_centers_
Centre

In [None]:
#Split the frame into one sub-frame per cluster label
C1, C2, C3 = [df.loc[df['Clusters'] == label] for label in (0, 1, 2)]

In [None]:
# One scatter layer per cluster, each in its own colour
cluster_frames = (C1, C2, C3)
cluster_colours = ('green', 'red', 'blue')
for members, colour in zip(cluster_frames, cluster_colours):
    plt.scatter(members['dateDifference'], members['videoViewCount'], color=colour)

# Overlay the centroids (column 0 = dateDifference, column 1 = videoViewCount)
centre_x = km.cluster_centers_[:, 0]
centre_y = km.cluster_centers_[:, 1]
plt.scatter(centre_x, centre_y, color='yellow', marker='*', label='centroid')
plt.xlabel('dateDifference')
plt.ylabel('videoViewCount')
plt.legend()

In [None]:
# Elbow method on the current df columns: fit K-means for K = 1..9 and record
# each model's inertia_ (within-cluster sum of squared distances).
SSE = []
kRange = range(1,10)
for k in kRange:
    # Seeded and with an explicit n_init so the curve is repeatable and does not
    # depend on the installed scikit-learn version's defaults.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df[['videoViewCount','dateDifference']])
    SSE.append(km.inertia_)

In [None]:
# Elbow curve for the latest SSE values
elbow_axes = plt.gca()
elbow_axes.set(xlabel='K', ylabel='Sum of Squared error')
elbow_axes.plot(kRange, SSE)

### K-nearest Neighbour (KNN)

In [None]:
#import library and function KNeighborsClassifier

from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes among the 10 nearest training points.
# NOTE(review): this classifier is later fitted with videoViewCount as the target,
# so every distinct view count is treated as its own class — confirm that a
# classifier (rather than a KNN regressor) is what is intended here.
KNN = KNeighborsClassifier(n_neighbors=10)


In [None]:
#Rebuild the single-column frames from the original (unscaled) dataset
#Response Variable
videoViewCount = updatedDF['videoViewCount'].to_frame()
#Predictor Variable
dateDifference = updatedDF['dateDifference'].to_frame()

In [None]:
# 80/20 train-test split with a fixed seed so the partition is repeatable
trainXDD, testXDD, trainYVC, testYVC = train_test_split(
    dateDifference,
    videoViewCount,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature scaling is currently disabled for this model. If it is re-enabled,
# fit the scaler on the training split only and apply it to the test split with
# transform(), so that no test-set statistics leak into training.

# Convert the splits to NumPy arrays. np.asarray accepts DataFrames and arrays
# alike, so this cell can be re-run safely (calling .values a second time on an
# ndarray would raise AttributeError).
trainXDD = np.asarray(trainXDD)
trainYVC = np.asarray(trainYVC)

testXDD = np.asarray(testXDD)
testYVC = np.asarray(testYVC)

In [None]:
# scikit-learn expects a 1-D target of shape (n_samples,); passing the
# (n_samples, 1) column as-is triggers a DataConversionWarning, so flatten it.
KNN.fit(trainXDD, np.ravel(trainYVC))
# Predicted label for every row of the test split
YVC_predicted = KNN.predict(testXDD)


In [None]:
# For a classifier, score() reports mean accuracy: the share of test rows whose
# predicted label equals the true videoViewCount exactly.
print("Accuracy:", KNN.score(testXDD, testYVC))

In [None]:
# Horizontal box plot of dateDifference to inspect its spread and any outliers
sb.boxplot(data= updatedDF['dateDifference'], orient='h')

In [None]:
# Horizontal box plot of videoViewCount to inspect its spread and any outliers
sb.boxplot(data= updatedDF['videoViewCount'], orient='h')

In [None]:
# Report how many rows ended up in each split
n_train_x, n_train_y = len(trainXDD), len(trainYVC)
n_test_x, n_test_y = len(testXDD), len(testYVC)

print("Trained Dataset:")
print("No. of rows for trained dataset for dateDifference: ",n_train_x)
print("No. of rows for trained dataset for videoViewCount: ",n_train_y)
print()
print("Test Dataset:")
print("No. of rows for test dataset for dateDifference: ",n_test_x)
print("No. of rows for test dataset for videoViewCount: ",n_test_y)

### Checking the accuracy of the model
- Using Correlation Matrix 
- Using Confusion Matrix and Heat Map
- Classification Report for model

In [None]:
#Correlation coefficient matrix of dateDifference and videoViewCount

corr_dd_vc = np.corrcoef(dateDifference['dateDifference'], videoViewCount['videoViewCount'])
print("Correlation of videoViewCount against dateDifference: \n", corr_dd_vc, "\n")

In [None]:
# Pairwise correlation between the predictor and the response
NDF = updatedDF[['dateDifference','videoViewCount']]
corr_matrix = NDF.corr()
print(corr_matrix)
# Annotated heatmap of the same matrix on a fixed [-1, 1] colour scale
f = plt.figure(figsize=(10, 10))
sb.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    linewidths=1,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 18},
    cmap="RdBu",
)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the test split again and tabulate true vs. predicted labels.
# NOTE(review): the labels here are raw videoViewCount values, so the matrix has
# one row/column per distinct value — confirm this is the intended diagnostic.
YVCPredicted = KNN.predict(testXDD)
ConMatrix= confusion_matrix(testYVC, YVCPredicted)
ConMatrix

In [None]:
# Heatmap of the Confusion Matrix
#f = plt.figure(figsize=(10, 10))
#sb.heatmap(ConMatrix, vmin = -1, vmax = 1, linewidths = 1,
#           annot = True, fmt = ".2f", annot_kws = {"size": 18}, cmap = "RdBu")
#plt.xlabel('Predicted')
#plt.ylabel('Truth')

In [None]:
#Classification Report for KNN Model
from sklearn.metrics import classification_report

# Per-label precision/recall/F1 comparing the true test labels with the KNN predictions.
# NOTE(review): labels are raw videoViewCount values, so expect one report row per distinct value.
print(classification_report(testYVC, YVCPredicted))

In [None]:
# Show how the test points (green stars) and training points (yellow plus signs)
# are distributed; labels and a legend make the two series distinguishable.
plt.scatter(testXDD, testYVC, color='green', marker='*', label='test')
plt.scatter(trainXDD, trainYVC, color='yellow', marker='+', label='train')
plt.xlabel('dateDifference')
plt.ylabel('videoViewCount')
plt.legend()

In [None]:
updatedDF.info()

In [None]:
len(updatedDF['channelViewCount'].unique())

In [None]:
len(updatedDF['dateDifference'].unique())

In [None]:
# NOTE: this rebinds `df` (until now the scaled clustering frame) to the two
# columns used for the channelViewCount regression below.
df = updatedDF[['dateDifference','channelViewCount']]

In [None]:
df

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
#Response: channelViewCount as a single-column DataFrame
channelViewCount = df['channelViewCount'].to_frame()
#Predictor: dateDifference as a single-column DataFrame
dateDifference = df['dateDifference'].to_frame()


In [None]:
# 80/20 split of dateDifference (predictor) and channelViewCount (response), seeded for repeatability
trainXDD, testXDD, trainYCVC, testYCVC = train_test_split(
    dateDifference,
    channelViewCount,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Performing Linear Regression on the train dataset between (response) channelViewCount and (predictor) dateDifference
# The estimator is only created here; it is fitted in the next cell.
reg = LinearRegression() #Creating linear regression object


In [None]:
# Fit the linear model on the training split (dateDifference -> channelViewCount)
reg.fit(trainXDD, trainYCVC)

In [None]:
# R^2 of the linear model on the training split
reg.score(trainXDD, trainYCVC)

In [None]:
# R^2 of the linear model on the held-out test split
reg.score(testXDD, testYCVC)

In [None]:
from sklearn import linear_model

In [None]:
# L1-regularised linear regression.
# NOTE(review): alpha=77, max_iter=100 and tol=0.1 are unexplained hand-picked
# values — confirm how they were chosen (e.g. by cross-validation).
LassoReg = linear_model.Lasso(alpha =77, max_iter=100, tol=0.1)

In [None]:
# Fit the Lasso model on the training split (dateDifference -> channelViewCount)
LassoReg.fit(trainXDD, trainYCVC)

In [None]:
# R^2 of the Lasso model on the training split
LassoReg.score(trainXDD, trainYCVC)

In [None]:
# R^2 of the Lasso model on the held-out test split
LassoReg.score(testXDD, testYCVC)

In [None]:
#Predict channelViewCount for the test dateDifference values (testXDD) with the Lasso model
testYCVC_predicted = LassoReg.predict(testXDD)

#Overlay the actual test values (green) and the Lasso predictions (red)
f = plt.figure(figsize=(16,8))
for y_values, colour in ((testYCVC, 'g'), (testYCVC_predicted, 'r')):
    plt.scatter(testXDD, y_values, color=colour)
plt.xlabel("Date Difference")
plt.ylabel("Channel View Count")
plt.show()

In [None]:
#Evaluate the Lasso model on the test split.
# R^2 is taken from LassoReg — the same model that produced testYCVC_predicted —
# so that all three metrics describe one model (not the plain linear model `reg`).
print("Explained Variance (R^2) of test set\t\t:", LassoReg.score(testXDD, testYCVC))
# Flatten both sides before computing the error: testYCVC is a single-column
# frame while the Lasso predictions are 1-D, and subtracting (n,) from (n, 1)
# would broadcast to an (n, n) matrix and give a wrong MSE.
mse= MSE(np.ravel(testYCVC), np.ravel(testYCVC_predicted))

print("Mean Squared Error (MSE) of test set\t\t:", mse)
print("Root Mean Squared Error (RMSE) of test set\t:", np.sqrt(mse))

In [None]:
# Columns kept for the subscriber-count analysis below
Cols = ['channelViewCount','videoViewCount', 'subscriberCount','likes/views', 'likes/dislikes']

In [None]:
updatedDF.info()

In [None]:
# Sub-frame restricted to the columns listed in Cols
UDF = updatedDF[Cols]

In [None]:
UDF.info()

In [None]:
#Predictor: channelViewCount only
#(alternative kept for reference: every column except the target, i.e. UDF.drop('subscriberCount', axis=1))
x = UDF['channelViewCount'].to_frame()

In [None]:
# Response: subscriberCount as a single-column DataFrame
y = UDF['subscriberCount'].to_frame()

In [None]:
# 80/20 split of channelViewCount (predictor) and subscriberCount (response), seeded for repeatability
trainX, testX, trainYSC, testYSC = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Fit a linear regression on the train dataset between (response) subscriberCount and (predictor) channelViewCount
reg = LinearRegression().fit(trainX, trainYSC)  # fit() returns the fitted estimator

# R^2 on the training split (shown as the cell output)
reg.score(trainX, trainYSC)

In [None]:
# R^2 of the linear model on the held-out test split
reg.score(testX, testYSC)

In [None]:
# Lasso regression of subscriberCount on channelViewCount, fitted on the training split.
# fit() returns the estimator itself, which is then displayed as the cell output.
LassoReg = linear_model.Lasso(alpha=77, max_iter=100, tol=0.1).fit(trainX, trainYSC)
LassoReg

In [None]:
# R^2 of the Lasso model on the training split
LassoReg.score(trainX, trainYSC)

In [None]:
# R^2 of the Lasso model on the held-out test split
LassoReg.score(testX, testYSC)

In [None]:
#Predict subscriberCount for the test channelViewCount values (testX) with the Lasso model
testYSC_predicted = LassoReg.predict(testX)

#Overlay the actual test values (green) and the Lasso predictions (red)
f = plt.figure(figsize=(16,8))
for y_values, colour in ((testYSC, 'g'), (testYSC_predicted, 'r')):
    plt.scatter(testX, y_values, color=colour)
plt.xlabel("Channel View Count")
plt.ylabel("Subscriber Count")
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes among the 3 nearest training points.
# NOTE(review): it is fitted below with subscriberCount as the target, so every
# distinct count is treated as its own class — confirm a classifier is intended.
KNN = KNeighborsClassifier(n_neighbors=3)
# Convert the splits to NumPy arrays. np.asarray accepts DataFrames and arrays
# alike, so this cell can be re-run safely (calling .values a second time on an
# ndarray would raise AttributeError).
trainX = np.asarray(trainX)
trainYSC = np.asarray(trainYSC)

testX = np.asarray(testX)
testYSC = np.asarray(testYSC)

In [None]:
# scikit-learn expects a 1-D target of shape (n_samples,); passing the
# (n_samples, 1) column as-is triggers a DataConversionWarning, so flatten it.
KNN.fit(trainX, np.ravel(trainYSC))
# Predicted label for every row of the test split
YSC_predicted = KNN.predict(testX)

In [None]:
# NOTE(review): confusion_matrix is imported here but not used in this cell.
from sklearn.metrics import confusion_matrix
# Predict labels for the test split; this repeats the call that produced
# YSC_predicted, storing the result under the name used by the report below.
YSCPredicted = KNN.predict(testX)


In [None]:
#Classification Report for KNN Model
from sklearn.metrics import classification_report

# Per-label precision/recall/F1 comparing the true test labels with the KNN predictions.
# NOTE(review): labels are raw subscriberCount values, so expect one report row per distinct value.
print(classification_report(testYSC, YSCPredicted))