In [2]:
import pandas as pd
import csv
import numpy as np

In [3]:
column_types = {
    'isAdult': float,
    'startYear': float,
    'endYear': float,
    'runtimeMinutes': float,
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'genres': str
}

titles_df = pd.read_csv("data.tsv", 
                        dtype=column_types,
                        na_values=r'\N',
                        sep="\t",
                        quoting=csv.QUOTE_NONE)

In [4]:
print(titles_df.shape)
titles_df.head()

(10408721, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [5]:
titles_df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [6]:
tv_types = [
    'tvMovie',
    'tvSeries',
    'tvEpisode',
    'tvShort',
    'tvMiniSeries',
    'tvSpecial'
]
titles_df = titles_df.loc[titles_df['titleType'].isin(tv_types)]

In [7]:
titles_df = titles_df.loc[titles_df['primaryTitle'] == titles_df['originalTitle']]
titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
28755,tt0029270,tvShort,Much Ado About Nothing,Much Ado About Nothing,0.0,1937.0,,10.0,"Comedy,Romance,Short"
29765,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0.0,1938.0,,101.0,"Drama,History"
34971,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0.0,1943.0,1947.0,15.0,
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
37633,tt0038309,tvMovie,As You Like It,As You Like It,0.0,1946.0,,,Drama
...,...,...,...,...,...,...,...,...,...
10408715,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0.0,2009.0,,,"Action,Drama,Family"
10408716,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0.0,2009.0,,,"Action,Drama,Family"
10408717,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0.0,2010.0,,,"Action,Drama,Family"
10408718,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0.0,2010.0,,,"Action,Drama,Family"


In [8]:
titles_df.loc[titles_df['startYear'].idxmin()]

tconst                 tt19595898
titleType            tvMiniSeries
primaryTitle      Solser en Hesse
originalTitle     Solser en Hesse
isAdult                         0
startYear                    1900
endYear                      1906
runtimeMinutes                NaN
genres                     Comedy
Name: 5049777, dtype: object

This cannot be true, because TV did not exist back then.

In [9]:
titles_df['startYear'].quantile(.01)

1957.0

We'll be generous, and keep any TV-related stuff after 1945.

In [10]:
titles_df = titles_df[titles_df['startYear'] >= 1945]
titles_df.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

For those entries with no endYear, we'll use the current year.

In [11]:
import datetime

# Assuming titles_df is your DataFrame
cur_year = int(datetime.datetime.now().year)

# Create a copy of the slice to avoid SettingWithCopyWarning
titles_df = titles_df.copy()

# Use .loc to replace NaN values in 'endYear'
titles_df.loc[:, 'endYear'].fillna(cur_year, inplace=True)


In [12]:
titles_df['genres']

37600                        Talk-Show
37633                            Drama
38056                    Drama,Fantasy
38434                 Family,Game-Show
38435                           Family
                       ...            
10408715           Action,Drama,Family
10408716           Action,Drama,Family
10408717           Action,Drama,Family
10408718           Action,Drama,Family
10408720    Adventure,Animation,Comedy
Name: genres, Length: 7172463, dtype: object

In [13]:
one_genre = titles_df[titles_df['genres'].str.count(',') == 0]
one_genre

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
37633,tt0038309,tvMovie,As You Like It,As You Like It,0.0,1946.0,2024.0,,Drama
38435,tt0039121,tvSeries,Birthday Party,Birthday Party,0.0,1947.0,1949.0,30.0,Family
38756,tt0039445,tvMovie,Hamlet Part 1,Hamlet Part 1,0.0,1947.0,2024.0,88.0,Drama
38929,tt0039618,tvMovie,The Merchant of Venice,The Merchant of Venice,0.0,1947.0,2024.0,90.0,Drama
...,...,...,...,...,...,...,...,...,...
10408666,tt9916742,tvEpisode,"From Victoria College, Jersey","From Victoria College, Jersey",0.0,1973.0,2024.0,,Family
10408667,tt9916744,tvEpisode,De Volta à Era Disco Music,De Volta à Era Disco Music,0.0,2019.0,2024.0,,Talk-Show
10408668,tt9916746,tvEpisode,HK 21 - Episode 4,HK 21 - Episode 4,0.0,2012.0,2024.0,,Sport
10408681,tt9916776,tvEpisode,Talent Coaching with IMOR's Bianca Desmore Mit...,Talent Coaching with IMOR's Bianca Desmore Mit...,0.0,2019.0,2024.0,,Talk-Show


In [14]:
genre_counts = one_genre['genres'].value_counts()
genre_counts

Drama          733546
Comedy         560731
Talk-Show      557928
News           451920
Documentary    360043
Reality-TV     294148
Adult          199118
Game-Show      148400
Family         137225
Sport          108117
Music           95567
Romance         66244
Animation       54973
Crime           38708
Adventure       22179
History         20346
Thriller        16597
Short           15556
Biography       15011
Horror          14818
Fantasy         14757
Action          13002
Sci-Fi          12656
Mystery         12001
Musical          9646
Western          8955
War              1057
Name: genres, dtype: int64

In [15]:
top_genres = one_genre[one_genre['genres'].isin(genre_counts.head(5).index.tolist())]
top_genres = top_genres[top_genres['titleType'] == 'tvMovie']
top_genres

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
37633,tt0038309,tvMovie,As You Like It,As You Like It,0.0,1946.0,2024.0,,Drama
38756,tt0039445,tvMovie,Hamlet Part 1,Hamlet Part 1,0.0,1947.0,2024.0,88.0,Drama
38929,tt0039618,tvMovie,The Merchant of Venice,The Merchant of Venice,0.0,1947.0,2024.0,90.0,Drama
39363,tt0040060,tvMovie,Wit and Wisdom,Wit and Wisdom,0.0,1948.0,2024.0,45.0,Comedy
39804,tt0040510,tvMovie,The Tragedy of King Lear Part 1,The Tragedy of King Lear Part 1,0.0,1948.0,2024.0,108.0,Drama
...,...,...,...,...,...,...,...,...,...
10407776,tt9914738,tvMovie,Les oubliés de la Mer de Chine,Les oubliés de la Mer de Chine,0.0,1981.0,2024.0,45.0,Documentary
10408019,tt9915312,tvMovie,Fuckboi Romeo,Fuckboi Romeo,0.0,2018.0,2024.0,,Comedy
10408123,tt9915546,tvMovie,Hot Explicit Ladies,Hot Explicit Ladies,0.0,2017.0,2024.0,,Comedy
10408535,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0.0,2019.0,2024.0,,Comedy


In [16]:
top_genres.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [18]:
import numpy as np
import pandas as pd

def distance(l1: np.ndarray, l2: np.ndarray) -> float:
    assert len(l1) == len(l2)
    return np.sum((l1 - l2) ** 2)

def clustering(df: pd.DataFrame, n_clusters: int) -> np.ndarray:
    columns = df.columns
    
    # Assignment of random coordinates to cluster centers
    coord_means = np.random.rand(n_clusters, len(columns))
    close_mean1 = np.full(df.shape[0], -1)
    close_mean2 = np.full(df.shape[0], -2)
    
    # The learning process stops when not a single point has changed its cluster during the iteration
    while not np.array_equal(close_mean1, close_mean2):
        close_mean1 = close_mean2.copy()
        close_mean2 = np.full(df.shape[0], -1)
        
        # Finding the nearest cluster center for each point
        for i in range(df.shape[0]):
            dist = np.array([distance(df.iloc[i, :], coord_mean) for coord_mean in coord_means])
            close_mean2[i] = np.argmin(dist)
        
        # Calculation of new coordinates for cluster centers
        for j in range(n_clusters):
            cluster_data = df[close_mean2 == j]
            if len(cluster_data) != 0:
                coord_means[j, :] = cluster_data.mean().values
            else:
                coord_means[j, :] = np.random.rand(len(columns))
    
    return close_mean2

In [19]:
data_for_clustering = top_genres[['startYear', 'endYear', 'runtimeMinutes']].values
# Convert NaN values to 0 in 'runtimeMinutes' column (you may adjust this based on your data)
data_for_clustering[:, 2] = np.nan_to_num(data_for_clustering[:, 2])

# Train k-means clustering
n_clusters = 5
cluster_labels = clustering(pd.DataFrame(data_for_clustering), n_clusters)

# Add cluster labels to the original DataFrame
top_genres['cluster'] = cluster_labels

# Display the result
print(top_genres[['tconst', 'titleType', 'genres', 'cluster']])

KeyboardInterrupt: 