In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
# Explicit dtype for every column. The numeric fields are read as float so that
# missing entries can be held as NaN; the remaining fields are read as text.
column_types = dict.fromkeys(
    ['isAdult', 'startYear', 'endYear', 'runtimeMinutes'], float
)
column_types.update(dict.fromkeys(
    ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'genres'], str
))

# Tab-separated input in which the literal \N marks a missing value.
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
titles_df = pd.read_csv(
    "data.tsv",
    sep="\t",
    dtype=column_types,
    na_values=r'\N',
    quoting=csv.QUOTE_NONE,
)

In [3]:
# Sanity-check the load: print (rows, columns), then display the first rows.
print(titles_df.shape)
titles_df.head()

(10408721, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [4]:
# List the distinct titleType values present in the data.
titles_df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [5]:
# Title types treated as television content.
# NOTE(review): 'tvPilot' also occurs in the data but is not listed here - confirm
# that leaving it out is intentional.
tv_types = ['tvMovie', 'tvSeries', 'tvEpisode', 'tvShort', 'tvMiniSeries', 'tvSpecial']

# Keep only the rows whose titleType is one of the TV types above.
titles_df = titles_df[titles_df['titleType'].isin(tv_types)]

In [6]:
# Keep only the titles whose primary title is identical to the original title,
# then display the filtered frame.
titles_df = titles_df[titles_df['primaryTitle'].eq(titles_df['originalTitle'])]
titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
28755,tt0029270,tvShort,Much Ado About Nothing,Much Ado About Nothing,0.0,1937.0,,10.0,"Comedy,Romance,Short"
29765,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0.0,1938.0,,101.0,"Drama,History"
34971,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0.0,1943.0,1947.0,15.0,
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
37633,tt0038309,tvMovie,As You Like It,As You Like It,0.0,1946.0,,,Drama
...,...,...,...,...,...,...,...,...,...
10408715,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0.0,2009.0,,,"Action,Drama,Family"
10408716,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0.0,2009.0,,,"Action,Drama,Family"
10408717,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0.0,2010.0,,,"Action,Drama,Family"
10408718,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0.0,2010.0,,,"Action,Drama,Family"


In [7]:
# Show the full record of the title with the smallest startYear.
titles_df.loc[titles_df['startYear'].idxmin()]

tconst                 tt19595898
titleType            tvMiniSeries
primaryTitle      Solser en Hesse
originalTitle     Solser en Hesse
isAdult                         0
startYear                    1900
endYear                      1906
runtimeMinutes                NaN
genres                     Comedy
Name: 5049777, dtype: object

This start year cannot be right: television broadcasting did not exist in 1900.

In [8]:
# 1st percentile of startYear: only 1% of the remaining titles start earlier than this.
titles_df['startYear'].quantile(.01)

1957.0

We'll be generous and keep every TV title whose start year is 1945 or later. Titles with no recorded start year are dropped by this filter as well.

In [9]:
# Keep titles that started in 1945 or later. Rows with a missing startYear are
# dropped as well, because a comparison against NaN evaluates to False.
titles_df = titles_df.loc[titles_df['startYear'].ge(1945)]
titles_df.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

For those entries with no endYear, we'll use the current year.

In [10]:
import datetime

# Titles without a recorded endYear are treated as running up to the current year.
cur_year = datetime.datetime.now().year

# Assign the filled column back rather than calling fillna(inplace=True) on the
# result of .loc[:, 'endYear']: that intermediate can be a copy, in which case the
# in-place fill never reaches titles_df (pandas warns about this chained pattern,
# and under Copy-on-Write it is always a no-op). assign() returns a new frame, so
# the frame produced by the earlier filter is left untouched.
titles_df = titles_df.assign(endYear=titles_df['endYear'].fillna(cur_year))

In [11]:
# Inspect the genres column; a single entry can hold several comma-separated genres.
titles_df['genres']

37600                        Talk-Show
37633                            Drama
38056                    Drama,Fantasy
38434                 Family,Game-Show
38435                           Family
                       ...            
10408715           Action,Drama,Family
10408716           Action,Drama,Family
10408717           Action,Drama,Family
10408718           Action,Drama,Family
10408720    Adventure,Animation,Comedy
Name: genres, Length: 7172463, dtype: object

In [12]:
# Reduce every title to a single genre.
# The pick among several genres is random, so use a seeded generator: with the
# unseeded global np.random.choice the chosen genres (and everything derived from
# them) changed on every kernel restart.
genre_rng = np.random.default_rng(42)

one_genre = titles_df.copy()
# Titles with no genre at all get the empty string as their genre.
one_genre['genres'] = one_genre['genres'].fillna('')
# A comma in the field means more than one genre is listed.
multiple_genres_mask = one_genre['genres'].str.contains(',')
# For those titles keep one of the listed genres, chosen uniformly at random.
one_genre.loc[multiple_genres_mask, 'genres'] = (
    one_genre.loc[multiple_genres_mask, 'genres']
    .apply(lambda x: genre_rng.choice(x.split(',')))
)
one_genre = one_genre.rename(columns={'genres': 'genre'})
one_genre

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genre
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
37633,tt0038309,tvMovie,As You Like It,As You Like It,0.0,1946.0,2024.0,,Drama
38056,tt0038738,tvMovie,A Midsummer Night's Dream,A Midsummer Night's Dream,0.0,1946.0,2024.0,150.0,Drama
38434,tt0039120,tvSeries,Americana,Americana,0.0,1947.0,1949.0,30.0,Family
38435,tt0039121,tvSeries,Birthday Party,Birthday Party,0.0,1947.0,1949.0,30.0,Family
...,...,...,...,...,...,...,...,...,...
10408715,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0.0,2009.0,2024.0,,Action
10408716,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0.0,2009.0,2024.0,,Action
10408717,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0.0,2010.0,2024.0,,Family
10408718,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0.0,2010.0,2024.0,,Action


In [13]:
import numpy as np

def kmeans(X, k, max_iters=100, tol=1e-4, random_state=None):
    """
    Perform k-means clustering on the input data.

    Parameters:
    - X: Input data, array-like of shape (n_samples, n_features); converted to float
    - k: Number of clusters (1 <= k <= n_samples)
    - max_iters: Maximum number of iterations, at least 1 (default: 100)
    - tol: Tolerance to declare convergence (default: 1e-4)
    - random_state: Optional seed for the centroid initialisation. When None
      (default) the global numpy random state is used, as before.

    Returns:
    - labels: Integer array of shape (n_samples,) with the cluster index of each
      data point, taken from the last assignment step that was executed

    Raises:
    - ValueError: if X is not 2-D, k is outside [1, n_samples], or max_iters < 1
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of samples")
    if max_iters < 1:
        raise ValueError("max_iters must be at least 1")

    # Initialize centroids as k distinct rows of X. Both the legacy np.random
    # module and a seeded Generator expose the same choice(...) call.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    for _ in range(max_iters):
        # Assign each data point to the nearest centroid (Euclidean distance)
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        # Update centroids. A cluster that received no points keeps its previous
        # centroid: taking the mean of an empty slice would give NaN, which then
        # poisons every later distance computation and the convergence test.
        counts = np.bincount(labels, minlength=k)
        new_centroids = centroids.copy()
        for i in range(k):
            if counts[i] > 0:
                new_centroids[i] = X[labels == i].mean(axis=0)

        # Check for convergence: total centroid movement below the tolerance
        if np.linalg.norm(new_centroids - centroids) < tol:
            break

        centroids = new_centroids

    return labels

# Build the clustering input from the adult flag, the runtime and the genre.
# The numeric columns are used as-is (no standardization); rows with a missing
# value in any of the three columns are discarded.
features = one_genre[['isAdult', 'runtimeMinutes', 'genre']].dropna()

# One-hot encode the genre, dropping the first level so it acts as the baseline,
# and place the indicator columns to the right of the two numeric columns.
encoded_categorical = pd.get_dummies(features['genre'], drop_first=True).to_numpy()
X = np.hstack([features[['isAdult', 'runtimeMinutes']].to_numpy(), encoded_categorical])

k = 6  # Number of clusters
labels = kmeans(X, k)

In [14]:
# Attach the cluster assignment; labels was computed from the rows of features,
# in the same order, so a positional assignment lines up.
features['cluster_label'] = labels
# Bring titleType back in (it was not part of the clustering input), matched on the index.
features['titleType'] = one_genre.loc[features.index, 'titleType']
# Display title type next to the assigned cluster.
features[['titleType', 'cluster_label']]

Unnamed: 0,titleType,cluster_label
37600,tvSeries,5
38056,tvMovie,4
38434,tvSeries,3
38435,tvSeries,3
38436,tvSeries,3
...,...,...
10408643,tvEpisode,5
10408644,tvMovie,0
10408677,tvEpisode,2
10408712,tvEpisode,5


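Observations of clusters (the cluster numbers are arbitrary labels and depend on the random initialization, so they can differ between runs):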

tvEpisode: Most instances are assigned to clusters 2 and 3. Clusters 0 and 5 also have a significant amount of instances.

tvMiniSeries: Evenly spread instances among all clusters except 1.

tvMovie: Most instances are in cluster 0, with cluster 2 having less than half that.

tvSeries: Mainly assigned to cluster 3, but cluster 2 also has many.

tvShort: Cluster 5 has the most instances, with cluster 3 having the second most.

tvSpecial: Spread across multiple clusters, with higher counts in cluster 0.