In [1]:
import pandas as pd
import csv
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
column_types = {
    'isAdult': float,
    'startYear': float,
    'endYear': float,
    'runtimeMinutes': float,
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'genres': str
}

titles_df = pd.read_csv("data.tsv", 
                        dtype=column_types,
                        na_values=r'\N',
                        sep="\t",
                        quoting=csv.QUOTE_NONE)

titles_df = titles_df.dropna(subset=['isAdult','runtimeMinutes','startYear'])

In [3]:
print(titles_df.shape)
titles_df.head()

(2949978, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [4]:
titles_df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [5]:
tv_types = [
    'tvMovie',
    'tvSeries',
    'tvEpisode',
    'tvShort',
    'tvMiniSeries',
    'tvSpecial'
]
titles_df = titles_df.loc[titles_df['titleType'].isin(tv_types)]

In [6]:
titles_df = titles_df.loc[titles_df['primaryTitle'] == titles_df['originalTitle']]
titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
28755,tt0029270,tvShort,Much Ado About Nothing,Much Ado About Nothing,0.0,1937.0,,10.0,"Comedy,Romance,Short"
29765,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0.0,1938.0,,101.0,"Drama,History"
34971,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0.0,1943.0,1947.0,15.0,
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
38056,tt0038738,tvMovie,A Midsummer Night's Dream,A Midsummer Night's Dream,0.0,1946.0,,150.0,"Drama,Fantasy"
...,...,...,...,...,...,...,...,...,...
10408643,tt9916690,tvEpisode,Horrid Henry Delivers the Milk,Horrid Henry Delivers the Milk,0.0,2012.0,,10.0,"Adventure,Animation,Comedy"
10408644,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0.0,2015.0,,66.0,Drama
10408677,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0.0,2019.0,,43.0,"Family,Game-Show,Reality-TV"
10408712,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0.0,2014.0,,11.0,"Adventure,Animation,Comedy"


In [7]:
def principal_component_analysis(dataframe, key_list):
    new_dataframe = dataframe[key_list]

    # Compute the mean and std of the data
    mean = np.mean(new_dataframe, axis=0)
    std = np.std(new_dataframe, axis=0)
    # Standardize the data by subtracting the mean and dividing by std
    standardized_data = (new_dataframe - mean) / std
    
    # Compute the covariance matrix
    cov_matrix = np.cov(standardized_data, rowvar=False)

    # Get the eigenvalues and eigenvectors
    eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)

    # Sort the eigenvectors by decreasing eigenvalues
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalue = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]

    # Transform the data
    return np.dot(standardized_data, sorted_eigenvectors)

In [8]:
foo = principal_component_analysis(titles_df,["isAdult",'runtimeMinutes', "startYear"])
foo

array([[-2.24930501, -1.92286087, -2.18237185],
       [-2.85453809,  0.03628494, -2.51660433],
       [-2.07057413, -1.70349914, -1.98260552],
       ...,
       [ 0.44219416,  0.32168503,  0.69178961],
       [ 0.48927506, -0.45459475,  0.63876866],
       [ 0.49631782, -0.47591732,  0.64284475]])

In [14]:
class GaussianNaiveBayes: 
    def __init__(self, x, y, log=False) :
        self.data = np.concatenate((np.array(x), np.array(y).reshape(-1, 1)), axis=1)
        self.n_features = x.shape[1]
        self.classes = set(y)
        self.log = log
    
    def fit(self) :
        dimensions = (len(self.classes),2)
        self.mean, self.std, self.len = np.zeros(dimensions),  np.zeros(dimensions), np.zeros(len(self.classes))

        for i in self.classes :
            j = int(i)
            d = self.data[self.data[:,-1]==i]
            self.mean[j] = np.mean(d, axis=0)[:-1]
            self.std[j] = np.std(d, axis=0)[:-1]
            self.len[j] = len(d)
    
    def get_probability(self, inp, mean, std) :
        res = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-(((inp-mean)**2)/(2*(std**2))))
        if self.log : 
            return np.log(1+res)
        return res
    
    def predict(self, test_set) :
        results = np.zeros(len(test_set)) #pd.DataFrame(columns=['isAdult'])
        test_arr = test_set.to_numpy()
        t = 0
        for index in range(len(test_arr)):
            inp = test_arr[index]
            pred_class, pred_prob = -1 , 0
            for i in self.classes: 
                j = int(i)
                probs = self.get_probability(inp, self.mean[j], self.std[j])
                class_prob = np.prod(probs)
                if class_prob > pred_prob : 
                    pred_class, pred_prob = i , class_prob
            results[index] = pred_class
            t += 1
        return results


In [15]:
X_train, X_test, y_train, y_test = train_test_split(titles_df[["startYear", "runtimeMinutes"]], titles_df["isAdult"], test_size=0.5, random_state=0)
gnb1 = GaussianNaiveBayes(X_train,y_train)
gnb1.fit()
y_pred1 = gnb1.predict(X_test)

In [16]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(titles_df[["startYear", "runtimeMinutes"]], titles_df["isAdult"], test_size=0.5, random_state=0)

gnb2 = GaussianNB()
y_pred2 = gnb2.fit(X_train1, y_train1).predict(X_test1)

In [20]:
correct1 = 0
y_test_arr = y_test.to_numpy()

for i in range(len(y_pred1)):
    if y_pred1[i] == y_test_arr[i]:
        correct1 += 1

correct2 = 0
for i in range(len(y_pred2)):
    if y_pred2[i] == y_test_arr[i]:
        correct2 += 1

print("Accuracy rate of our model :", round( (correct1 * 100 / len(y_pred1)) , 2), "%")
print("Accuracy rate of the prebuilt model :", round( (correct2 * 100 / len(y_pred2)) , 2), "%")

Accuracy rate of our model : 46.82 %
Accuracy rate of the prebuilt model : 97.46 %
