In [17]:
import pandas as pd
import csv
import numpy as np

In [18]:
# Explicit dtypes for the columns of the TSV. The numeric fields are read as
# float so that missing entries (spelled \N in the file) can become NaN.
column_types = {
    **dict.fromkeys(['isAdult', 'startYear', 'endYear', 'runtimeMinutes'], float),
    **dict.fromkeys(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'genres'], str),
}

# Load the tab-separated file with quoting disabled (quote characters are kept
# as literal text), then keep only the rows that have a value in every column
# listed in `subset`.
titles_df = (
    pd.read_csv(
        "data.tsv",
        sep="\t",
        dtype=column_types,
        na_values=r'\N',
        quoting=csv.QUOTE_NONE,
    )
    .dropna(subset=['isAdult', 'runtimeMinutes', 'startYear'])
)

In [19]:
# Report the (rows, columns) of the loaded frame, then preview its first five rows.
print(tuple(titles_df.shape))
titles_df.head(n=5)

(2949978, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [20]:
# Show the distinct values found in the titleType column, in order of first appearance.
pd.unique(titles_df['titleType'])

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [21]:
# Title types that are kept by the filter below.
tv_types = ['tvMovie', 'tvSeries', 'tvEpisode', 'tvShort', 'tvMiniSeries', 'tvSpecial']

# Keep only the rows whose titleType is one of the values listed above.
titles_df = titles_df[titles_df['titleType'].isin(tv_types)]

In [22]:
# Keep only the rows whose primaryTitle is identical to originalTitle
# (rows where either value is missing compare as unequal and are dropped).
titles_df = titles_df[titles_df['primaryTitle'].eq(titles_df['originalTitle'])]
titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
28755,tt0029270,tvShort,Much Ado About Nothing,Much Ado About Nothing,0.0,1937.0,,10.0,"Comedy,Romance,Short"
29765,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0.0,1938.0,,101.0,"Drama,History"
34971,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0.0,1943.0,1947.0,15.0,
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
38056,tt0038738,tvMovie,A Midsummer Night's Dream,A Midsummer Night's Dream,0.0,1946.0,,150.0,"Drama,Fantasy"
...,...,...,...,...,...,...,...,...,...
10408643,tt9916690,tvEpisode,Horrid Henry Delivers the Milk,Horrid Henry Delivers the Milk,0.0,2012.0,,10.0,"Adventure,Animation,Comedy"
10408644,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0.0,2015.0,,66.0,Drama
10408677,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0.0,2019.0,,43.0,"Family,Game-Show,Reality-TV"
10408712,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0.0,2014.0,,11.0,"Adventure,Animation,Comedy"


In [23]:
def principal_component_analysis(dataframe, key_list):
    """Project the selected columns of ``dataframe`` onto their principal axes.

    The columns named in ``key_list`` are mean-centred, their sample
    covariance matrix is eigendecomposed, and the centred data is rotated
    onto the eigenvectors, ordered from the largest eigenvalue to the
    smallest. The columns are not rescaled to unit variance, so columns with
    a larger variance dominate the leading components.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; the selected columns must be numeric.
    key_list : list of str
        Names of the columns to include in the analysis.

    Returns
    -------
    numpy.ndarray
        Real-valued array of shape ``(n_rows, len(key_list))``. Column ``i``
        holds the coordinates of every row along the ``i``-th principal axis.
        The sign of each column is arbitrary, as for any eigenvector.
    """
    # Work on a plain float array so every step below is ordinary NumPy math.
    values = dataframe[key_list].to_numpy(dtype=float)

    # Center the data by subtracting the per-column mean.
    centered_data = values - values.mean(axis=0)

    # np.cov collapses to a 0-d array when only one column is selected;
    # restore the (n_features, n_features) shape the eigen-solver requires.
    cov_matrix = np.atleast_2d(np.cov(centered_data, rowvar=False))

    # A covariance matrix is symmetric, so use the symmetric solver: it always
    # returns real eigenvalues and orthonormal eigenvectors, which the general
    # np.linalg.eig does not guarantee.
    eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)

    # Order the eigenvectors (columns) by decreasing eigenvalue.
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]

    # Rotate the centred data onto the principal axes.
    return np.dot(centered_data, sorted_eigenvectors)

In [24]:
# Run the PCA on the three numeric columns and display the projected coordinates.
foo = principal_component_analysis(
    dataframe=titles_df,
    key_list=["isAdult", 'runtimeMinutes', "startYear"],
)
foo

array([[-3.07612038e+01,  6.63303286e+01, -3.07568865e-02],
       [ 6.02295864e+01,  6.46944937e+01, -4.26119969e-02],
       [-2.58032499e+01,  6.02955403e+01, -2.59529832e-02],
       ...,
       [ 1.66502853e+00, -1.58982216e+01,  3.98558333e-02],
       [-3.02992533e+01, -1.06747467e+01,  3.97592572e-02],
       [-3.12992288e+01, -1.06677592e+01,  3.98996169e-02]])