In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
DATA_PATH = r"/content/drive/MyDrive/Colab Notebooks/Deep Learning/movies.csv"


In [5]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split


SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [8]:
df = pd.read_csv(DATA_PATH)

print("Original shape:", df.shape)
print(df.columns.tolist())

Original shape: (4803, 24)
['index', 'budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew', 'director']


In [7]:
df = pd.read_csv(DATA_PATH)
print(df.columns.tolist())

['index', 'budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew', 'director']


In [9]:
allowed_cols = ["overview", "tagline", "keywords", "genres", "vote_average"]
df = df[allowed_cols].copy()

print("Shape after filtering:", df.shape)
df.head()

Shape after filtering: (4803, 5)


Unnamed: 0,overview,tagline,keywords,genres,vote_average
0,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction,7.2
1,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.",ocean drug abuse exotic island east india trad...,Adventure Fantasy Action,6.9
2,A cryptic message from Bond’s past sends him o...,A Plan No One Escapes,spy based on novel secret agent sequel mi6,Action Adventure Crime,6.3
3,Following the death of District Attorney Harve...,The Legend Ends,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller,7.6
4,"John Carter is a war-weary, former military ca...","Lost in our world, found in another.",based on novel mars medallion space travel pri...,Action Adventure Science Fiction,6.1


In [10]:
df = df.dropna(subset=["genres", "vote_average"])

df["overview"] = df["overview"].fillna("")
df["tagline"] = df["tagline"].fillna("")
df["keywords"] = df["keywords"].fillna("")

print("Shape after cleaning:", df.shape)

Shape after cleaning: (4775, 5)


In [11]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

for col in ["overview", "tagline", "keywords"]:
    df[col] = df[col].apply(preprocess_text)

df.head()

Unnamed: 0,overview,tagline,keywords,genres,vote_average
0,in the nd century a paraplegic marine is dispa...,enter the world of pandora,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction,7.2
1,captain barbossa long believed to be dead has ...,at the end of the world the adventure begins,ocean drug abuse exotic island east india trad...,Adventure Fantasy Action,6.9
2,a cryptic message from bonds past sends him on...,a plan no one escapes,spy based on novel secret agent sequel mi,Action Adventure Crime,6.3
3,following the death of district attorney harve...,the legend ends,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller,7.6
4,john carter is a warweary former military capt...,lost in our world found in another,based on novel mars medallion space travel pri...,Action Adventure Science Fiction,6.1


In [12]:
def tokenize(text):
    return text.split()

for col in ["overview", "tagline", "keywords"]:
    df[col + "_tokens"] = df[col].apply(tokenize)

df.head()

Unnamed: 0,overview,tagline,keywords,genres,vote_average,overview_tokens,tagline_tokens,keywords_tokens
0,in the nd century a paraplegic marine is dispa...,enter the world of pandora,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction,7.2,"[in, the, nd, century, a, paraplegic, marine, ...","[enter, the, world, of, pandora]","[culture, clash, future, space, war, space, co..."
1,captain barbossa long believed to be dead has ...,at the end of the world the adventure begins,ocean drug abuse exotic island east india trad...,Adventure Fantasy Action,6.9,"[captain, barbossa, long, believed, to, be, de...","[at, the, end, of, the, world, the, adventure,...","[ocean, drug, abuse, exotic, island, east, ind..."
2,a cryptic message from bonds past sends him on...,a plan no one escapes,spy based on novel secret agent sequel mi,Action Adventure Crime,6.3,"[a, cryptic, message, from, bonds, past, sends...","[a, plan, no, one, escapes]","[spy, based, on, novel, secret, agent, sequel,..."
3,following the death of district attorney harve...,the legend ends,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller,7.6,"[following, the, death, of, district, attorney...","[the, legend, ends]","[dc, comics, crime, fighter, terrorist, secret..."
4,john carter is a warweary former military capt...,lost in our world found in another,based on novel mars medallion space travel pri...,Action Adventure Science Fiction,6.1,"[john, carter, is, a, warweary, former, milita...","[lost, in, our, world, found, in, another]","[based, on, novel, mars, medallion, space, tra..."


In [13]:
from sklearn.model_selection import train_test_split


SEED = 42
random.seed(SEED)
np.random.seed(SEED)


train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=SEED,
    shuffle=True
)


val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=SEED,
    shuffle=True
)

print("Train:", len(train_df))
print("Val:", len(val_df))
print("Test:", len(test_df))

Train: 3342
Val: 716
Test: 717


In [14]:
GLOVE_PATH = r"/content/drive/MyDrive/Colab Notebooks/Deep Learning/glove.6B.100d.txt"

In [18]:
EMBEDDING_DIM = 100

def load_glove_safe(path):
    embeddings = {}

    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.strip().split()


            if len(values) != EMBEDDING_DIM + 1:
                continue

            word = values[0]

            try:
                vector = np.array(values[1:], dtype=np.float32)
                embeddings[word] = vector
            except ValueError:
                continue

    return embeddings

In [19]:
print("Loading GloVe safely...")
glove_embeddings = load_glove_safe(GLOVE_PATH)
print("Loaded words:", len(glove_embeddings))

Loading GloVe safely...
Loaded words: 1287614


In [20]:
print("Embedding dimension:", len(glove_embeddings["the"]))

Embedding dimension: 100


In [21]:
all_tokens = set()

for col in ["overview_tokens", "tagline_tokens", "keywords_tokens"]:
    for tokens in train_df[col]:
        all_tokens.update(tokens)

print("Total unique tokens in training set:", len(all_tokens))

Total unique tokens in training set: 20554


In [22]:
covered_tokens = [token for token in all_tokens if token in glove_embeddings]

coverage = len(covered_tokens) / len(all_tokens) * 100

print(f"GloVe Coverage: {coverage:.2f}%")
print("Covered tokens:", len(covered_tokens))

GloVe Coverage: 92.77%
Covered tokens: 19068


In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

def fit_tfidf(text_series):
    vectorizer = TfidfVectorizer(
        max_features=20000,
        lowercase=False,
        tokenizer=lambda x: x.split(),
        preprocessor=None,
        token_pattern=None
    )

    tfidf_matrix = vectorizer.fit_transform(text_series)
    return vectorizer, tfidf_matrix

In [24]:
def build_tfidf_glove_embeddings(text_series, vectorizer):

    tfidf_matrix = vectorizer.transform(text_series)
    feature_names = vectorizer.get_feature_names_out()

    embeddings = np.zeros((len(text_series), EMBEDDING_DIM))

    for i in range(tfidf_matrix.shape[0]):

        row = tfidf_matrix[i]
        indices = row.indices
        data = row.data

        weighted_sum = np.zeros(EMBEDDING_DIM)
        weight_total = 0.0

        for idx, weight in zip(indices, data):
            word = feature_names[idx]

            if word in glove_embeddings:
                weighted_sum += weight * glove_embeddings[word]
                weight_total += weight

        if weight_total > 0:
            embeddings[i] = weighted_sum / weight_total
        else:
            embeddings[i] = np.zeros(EMBEDDING_DIM)

    return embeddings

In [25]:
overview_vectorizer, _ = fit_tfidf(train_df["overview"])

In [26]:
X_train_overview = build_tfidf_glove_embeddings(train_df["overview"], overview_vectorizer)
X_val_overview   = build_tfidf_glove_embeddings(val_df["overview"], overview_vectorizer)
X_test_overview  = build_tfidf_glove_embeddings(test_df["overview"], overview_vectorizer)

print("Train embedding shape:", X_train_overview.shape)

Train embedding shape: (3342, 100)


In [27]:
import torch
import torch.nn as nn

X_train = torch.tensor(X_train_overview, dtype=torch.float32)
X_val   = torch.tensor(X_val_overview, dtype=torch.float32)
X_test  = torch.tensor(X_test_overview, dtype=torch.float32)

y_train = torch.tensor(train_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_val   = torch.tensor(val_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_test  = torch.tensor(test_df["vote_average"].values, dtype=torch.float32).view(-1, 1)

In [28]:
class RegressionModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)

model = RegressionModel(input_dim=100)

In [29]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [30]:
EPOCHS = 20

for epoch in range(EPOCHS):

    model.train()
    optimizer.zero_grad()

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {loss.item():.4f} | "
          f"Val Loss: {val_loss.item():.4f}")

Epoch 1/20 | Train Loss: 39.6201 | Val Loss: 39.1584
Epoch 2/20 | Train Loss: 39.2810 | Val Loss: 38.8156
Epoch 3/20 | Train Loss: 38.9510 | Val Loss: 38.5039
Epoch 4/20 | Train Loss: 38.6447 | Val Loss: 38.2234
Epoch 5/20 | Train Loss: 38.3663 | Val Loss: 37.9513
Epoch 6/20 | Train Loss: 38.0988 | Val Loss: 37.6719
Epoch 7/20 | Train Loss: 37.8279 | Val Loss: 37.3785
Epoch 8/20 | Train Loss: 37.5456 | Val Loss: 37.0692
Epoch 9/20 | Train Loss: 37.2492 | Val Loss: 36.7426
Epoch 10/20 | Train Loss: 36.9371 | Val Loss: 36.3984
Epoch 11/20 | Train Loss: 36.6084 | Val Loss: 36.0357
Epoch 12/20 | Train Loss: 36.2619 | Val Loss: 35.6530
Epoch 13/20 | Train Loss: 35.8965 | Val Loss: 35.2513
Epoch 14/20 | Train Loss: 35.5119 | Val Loss: 34.8324
Epoch 15/20 | Train Loss: 35.1093 | Val Loss: 34.3935
Epoch 16/20 | Train Loss: 34.6878 | Val Loss: 33.9283
Epoch 17/20 | Train Loss: 34.2422 | Val Loss: 33.4319
Epoch 18/20 | Train Loss: 33.7673 | Val Loss: 32.9004
Epoch 19/20 | Train Loss: 33.2595 | V

In [31]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)
    test_mse = criterion(test_predictions, y_test).item()
    test_rmse = test_mse ** 0.5

print("\nTest MSE:", round(test_mse, 4))
print("Test RMSE:", round(test_rmse, 4))


Test MSE: 32.3793
Test RMSE: 5.6903


In [32]:
tagline_vectorizer, _ = fit_tfidf(train_df["tagline"])

In [33]:
X_train_tagline = build_tfidf_glove_embeddings(train_df["tagline"], tagline_vectorizer)
X_val_tagline   = build_tfidf_glove_embeddings(val_df["tagline"], tagline_vectorizer)
X_test_tagline  = build_tfidf_glove_embeddings(test_df["tagline"], tagline_vectorizer)

print("Tagline Train Shape:", X_train_tagline.shape)

Tagline Train Shape: (3342, 100)


In [34]:
X_train = torch.tensor(X_train_tagline, dtype=torch.float32)
X_val   = torch.tensor(X_val_tagline, dtype=torch.float32)
X_test  = torch.tensor(X_test_tagline, dtype=torch.float32)

In [35]:
y_train = torch.tensor(train_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_val   = torch.tensor(val_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_test  = torch.tensor(test_df["vote_average"].values, dtype=torch.float32).view(-1, 1)

In [36]:
model = RegressionModel(input_dim=100)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [37]:
EPOCHS = 20

for epoch in range(EPOCHS):

    model.train()
    optimizer.zero_grad()

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {loss.item():.4f} | "
          f"Val Loss: {val_loss.item():.4f}")

Epoch 1/20 | Train Loss: 39.4015 | Val Loss: 39.0499
Epoch 2/20 | Train Loss: 39.0960 | Val Loss: 38.7536
Epoch 3/20 | Train Loss: 38.8100 | Val Loss: 38.4707
Epoch 4/20 | Train Loss: 38.5387 | Val Loss: 38.2048
Epoch 5/20 | Train Loss: 38.2831 | Val Loss: 37.9558
Epoch 6/20 | Train Loss: 38.0428 | Val Loss: 37.7206
Epoch 7/20 | Train Loss: 37.8146 | Val Loss: 37.4909
Epoch 8/20 | Train Loss: 37.5919 | Val Loss: 37.2603
Epoch 9/20 | Train Loss: 37.3694 | Val Loss: 37.0267
Epoch 10/20 | Train Loss: 37.1437 | Val Loss: 36.7871
Epoch 11/20 | Train Loss: 36.9108 | Val Loss: 36.5361
Epoch 12/20 | Train Loss: 36.6669 | Val Loss: 36.2702
Epoch 13/20 | Train Loss: 36.4089 | Val Loss: 35.9881
Epoch 14/20 | Train Loss: 36.1358 | Val Loss: 35.6891
Epoch 15/20 | Train Loss: 35.8470 | Val Loss: 35.3739
Epoch 16/20 | Train Loss: 35.5426 | Val Loss: 35.0420
Epoch 17/20 | Train Loss: 35.2220 | Val Loss: 34.6928
Epoch 18/20 | Train Loss: 34.8845 | Val Loss: 34.3267
Epoch 19/20 | Train Loss: 34.5307 | V

In [38]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)
    test_mse = criterion(test_predictions, y_test).item()
    test_rmse = test_mse ** 0.5

print("\nTagline Test MSE:", round(test_mse, 4))
print("Tagline Test RMSE:", round(test_rmse, 4))


Tagline Test MSE: 34.0986
Tagline Test RMSE: 5.8394


In [39]:
keywords_vectorizer, _ = fit_tfidf(train_df["keywords"])

In [40]:
X_train_keywords = build_tfidf_glove_embeddings(train_df["keywords"], keywords_vectorizer)
X_val_keywords   = build_tfidf_glove_embeddings(val_df["keywords"], keywords_vectorizer)
X_test_keywords  = build_tfidf_glove_embeddings(test_df["keywords"], keywords_vectorizer)

print("Keywords Train Shape:", X_train_keywords.shape)

Keywords Train Shape: (3342, 100)


In [41]:
X_train = torch.tensor(X_train_keywords, dtype=torch.float32)
X_val   = torch.tensor(X_val_keywords, dtype=torch.float32)
X_test  = torch.tensor(X_test_keywords, dtype=torch.float32)

In [42]:
y_train = torch.tensor(train_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_val   = torch.tensor(val_df["vote_average"].values, dtype=torch.float32).view(-1, 1)
y_test  = torch.tensor(test_df["vote_average"].values, dtype=torch.float32).view(-1, 1)

In [43]:
model = RegressionModel(input_dim=100)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [44]:
EPOCHS = 20

for epoch in range(EPOCHS):

    model.train()
    optimizer.zero_grad()

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {loss.item():.4f} | "
          f"Val Loss: {val_loss.item():.4f}")

Epoch 1/20 | Train Loss: 38.1238 | Val Loss: 37.8016
Epoch 2/20 | Train Loss: 37.8662 | Val Loss: 37.5324
Epoch 3/20 | Train Loss: 37.6073 | Val Loss: 37.2634
Epoch 4/20 | Train Loss: 37.3492 | Val Loss: 36.9970
Epoch 5/20 | Train Loss: 37.0946 | Val Loss: 36.7354
Epoch 6/20 | Train Loss: 36.8451 | Val Loss: 36.4796
Epoch 7/20 | Train Loss: 36.6007 | Val Loss: 36.2287
Epoch 8/20 | Train Loss: 36.3604 | Val Loss: 35.9768
Epoch 9/20 | Train Loss: 36.1190 | Val Loss: 35.7192
Epoch 10/20 | Train Loss: 35.8727 | Val Loss: 35.4515
Epoch 11/20 | Train Loss: 35.6174 | Val Loss: 35.1711
Epoch 12/20 | Train Loss: 35.3503 | Val Loss: 34.8753
Epoch 13/20 | Train Loss: 35.0682 | Val Loss: 34.5620
Epoch 14/20 | Train Loss: 34.7695 | Val Loss: 34.2296
Epoch 15/20 | Train Loss: 34.4523 | Val Loss: 33.8767
Epoch 16/20 | Train Loss: 34.1150 | Val Loss: 33.5021
Epoch 17/20 | Train Loss: 33.7565 | Val Loss: 33.1048
Epoch 18/20 | Train Loss: 33.3756 | Val Loss: 32.6826
Epoch 19/20 | Train Loss: 32.9707 | V

In [45]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)
    test_mse = criterion(test_predictions, y_test).item()
    test_rmse = test_mse ** 0.5

print("\nKeywords Test MSE:", round(test_mse, 4))
print("Keywords Test RMSE:", round(test_rmse, 4))


Keywords Test MSE: 32.4401
Keywords Test RMSE: 5.6956


In [51]:
train_df["genres"].head(5)

Unnamed: 0,genres
908,Thriller Science Fiction Mystery
2476,Drama
4110,History Documentary
4083,Drama
835,Comedy Family


In [52]:
train_df["genres"].unique()[:20]

array(['Thriller Science Fiction Mystery', 'Drama', 'History Documentary',
       'Comedy Family', 'Animation Family Adventure Science Fiction',
       'Comedy Documentary', 'Action Crime Fantasy',
       'Action Comedy Thriller Crime Mystery', 'Drama History',
       'Action Drama History Thriller War', 'Documentary',
       'Action Crime Drama Thriller', 'Drama Romance', 'Horror Mystery',
       'Comedy Romance', 'Action Thriller', 'Drama Comedy',
       'Adventure Comedy Fantasy', 'Drama Thriller Romance',
       'Action Thriller War'], dtype=object)

In [53]:
KNOWN_GENRES = [
    "Action", "Adventure", "Animation", "Comedy", "Crime", "Documentary",
    "Drama", "Family", "Fantasy", "History", "Horror",
    "Music", "Mystery", "Romance", "Science Fiction",
    "TV Movie", "Thriller", "War", "Western"
]

In [54]:
def extract_genres_from_string(genre_str):
    found = []
    for genre in KNOWN_GENRES:
        if genre in genre_str:
            found.append(genre)
    return found

In [55]:
train_df["genre_list"] = train_df["genres"].apply(extract_genres_from_string)
val_df["genre_list"]   = val_df["genres"].apply(extract_genres_from_string)
test_df["genre_list"]  = test_df["genres"].apply(extract_genres_from_string)

In [56]:
train_df["genre_list"].head()

Unnamed: 0,genre_list
908,"[Mystery, Science Fiction, Thriller]"
2476,[Drama]
4110,"[Documentary, History]"
4083,[Drama]
835,"[Comedy, Family]"


In [57]:
all_genres = set()

for genres in train_df["genre_list"]:
    all_genres.update(genres)

genre_to_idx = {g: i for i, g in enumerate(sorted(all_genres))}
idx_to_genre = {i: g for g, i in genre_to_idx.items()}

NUM_GENRES = len(genre_to_idx)

print("Number of genres:", NUM_GENRES)
print("Genres:", sorted(all_genres))

Number of genres: 19
Genres: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']


In [59]:
import numpy as np

def encode_genres(genre_lists):
    multi_hot = np.zeros((len(genre_lists), NUM_GENRES))

    for i, genres in enumerate(genre_lists):
        for g in genres:
            if g in genre_to_idx:
                multi_hot[i, genre_to_idx[g]] = 1

    return multi_hot

In [60]:
y_train_multi = encode_genres(train_df["genre_list"])
y_val_multi   = encode_genres(val_df["genre_list"])
y_test_multi  = encode_genres(test_df["genre_list"])

In [61]:
import torch

y_train = torch.tensor(y_train_multi, dtype=torch.float32)
y_val   = torch.tensor(y_val_multi, dtype=torch.float32)
y_test  = torch.tensor(y_test_multi, dtype=torch.float32)

In [62]:
X_train = torch.tensor(X_train_overview, dtype=torch.float32)
X_val   = torch.tensor(X_val_overview, dtype=torch.float32)
X_test  = torch.tensor(X_test_overview, dtype=torch.float32)

In [63]:
import torch.nn as nn

class MultiLabelModel(nn.Module):
    def __init__(self, input_dim, num_labels):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_labels)
        )

    def forward(self, x):
        return self.model(x)

model = MultiLabelModel(input_dim=100, num_labels=NUM_GENRES)

In [64]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [65]:
EPOCHS = 20

for epoch in range(EPOCHS):

    model.train()
    optimizer.zero_grad()

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {loss.item():.4f} | "
          f"Val Loss: {val_loss.item():.4f}")

Epoch 1/20 | Train Loss: 0.6875 | Val Loss: 0.6816
Epoch 2/20 | Train Loss: 0.6820 | Val Loss: 0.6762
Epoch 3/20 | Train Loss: 0.6769 | Val Loss: 0.6710
Epoch 4/20 | Train Loss: 0.6720 | Val Loss: 0.6657
Epoch 5/20 | Train Loss: 0.6669 | Val Loss: 0.6601
Epoch 6/20 | Train Loss: 0.6616 | Val Loss: 0.6539
Epoch 7/20 | Train Loss: 0.6557 | Val Loss: 0.6470
Epoch 8/20 | Train Loss: 0.6491 | Val Loss: 0.6393
Epoch 9/20 | Train Loss: 0.6417 | Val Loss: 0.6306
Epoch 10/20 | Train Loss: 0.6335 | Val Loss: 0.6212
Epoch 11/20 | Train Loss: 0.6245 | Val Loss: 0.6108
Epoch 12/20 | Train Loss: 0.6145 | Val Loss: 0.5995
Epoch 13/20 | Train Loss: 0.6037 | Val Loss: 0.5873
Epoch 14/20 | Train Loss: 0.5919 | Val Loss: 0.5742
Epoch 15/20 | Train Loss: 0.5792 | Val Loss: 0.5602
Epoch 16/20 | Train Loss: 0.5657 | Val Loss: 0.5455
Epoch 17/20 | Train Loss: 0.5514 | Val Loss: 0.5302
Epoch 18/20 | Train Loss: 0.5364 | Val Loss: 0.5143
Epoch 19/20 | Train Loss: 0.5208 | Val Loss: 0.4981
Epoch 20/20 | Train L

In [66]:
from sklearn.metrics import f1_score, hamming_loss

model.eval()
with torch.no_grad():
    logits = model(X_test)
    probs = torch.sigmoid(logits)

    predictions = (probs > 0.5).int().cpu().numpy()
    true_labels = y_test.cpu().numpy()

In [67]:
micro_f1 = f1_score(true_labels, predictions, average="micro")
macro_f1 = f1_score(true_labels, predictions, average="macro")
hamming  = hamming_loss(true_labels, predictions)

print("\nOverview Results:")
print("Micro-F1:", round(micro_f1, 4))
print("Macro-F1:", round(macro_f1, 4))
print("Hamming Loss:", round(hamming, 4))


Overview Results:
Micro-F1: 0.2472
Macro-F1: 0.0331
Hamming Loss: 0.135


In [68]:
X_train = torch.tensor(X_train_keywords, dtype=torch.float32)
X_val   = torch.tensor(X_val_keywords, dtype=torch.float32)
X_test  = torch.tensor(X_test_keywords, dtype=torch.float32)

In [69]:
y_train = torch.tensor(y_train_multi, dtype=torch.float32)
y_val   = torch.tensor(y_val_multi, dtype=torch.float32)
y_test  = torch.tensor(y_test_multi, dtype=torch.float32)

In [70]:
model = MultiLabelModel(input_dim=100, num_labels=NUM_GENRES)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [71]:
EPOCHS = 20

for epoch in range(EPOCHS):

    model.train()
    optimizer.zero_grad()

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {loss.item():.4f} | "
          f"Val Loss: {val_loss.item():.4f}")

Epoch 1/20 | Train Loss: 0.6894 | Val Loss: 0.6847
Epoch 2/20 | Train Loss: 0.6851 | Val Loss: 0.6804
Epoch 3/20 | Train Loss: 0.6809 | Val Loss: 0.6760
Epoch 4/20 | Train Loss: 0.6766 | Val Loss: 0.6714
Epoch 5/20 | Train Loss: 0.6720 | Val Loss: 0.6664
Epoch 6/20 | Train Loss: 0.6672 | Val Loss: 0.6609
Epoch 7/20 | Train Loss: 0.6618 | Val Loss: 0.6549
Epoch 8/20 | Train Loss: 0.6559 | Val Loss: 0.6481
Epoch 9/20 | Train Loss: 0.6494 | Val Loss: 0.6405
Epoch 10/20 | Train Loss: 0.6420 | Val Loss: 0.6320
Epoch 11/20 | Train Loss: 0.6337 | Val Loss: 0.6225
Epoch 12/20 | Train Loss: 0.6245 | Val Loss: 0.6120
Epoch 13/20 | Train Loss: 0.6143 | Val Loss: 0.6004
Epoch 14/20 | Train Loss: 0.6031 | Val Loss: 0.5878
Epoch 15/20 | Train Loss: 0.5909 | Val Loss: 0.5742
Epoch 16/20 | Train Loss: 0.5776 | Val Loss: 0.5595
Epoch 17/20 | Train Loss: 0.5634 | Val Loss: 0.5440
Epoch 18/20 | Train Loss: 0.5482 | Val Loss: 0.5278
Epoch 19/20 | Train Loss: 0.5323 | Val Loss: 0.5111
Epoch 20/20 | Train L

In [72]:
from sklearn.metrics import f1_score, hamming_loss

model.eval()
with torch.no_grad():
    logits = model(X_test)
    probs = torch.sigmoid(logits)

    predictions = (probs > 0.5).int().cpu().numpy()
    true_labels = y_test.cpu().numpy()

micro_f1 = f1_score(true_labels, predictions, average="micro")
macro_f1 = f1_score(true_labels, predictions, average="macro")
hamming  = hamming_loss(true_labels, predictions)

print("\nKeywords Results:")
print("Micro-F1:", round(micro_f1, 4))
print("Macro-F1:", round(macro_f1, 4))
print("Hamming Loss:", round(hamming, 4))


Keywords Results:
Micro-F1: 0.0269
Macro-F1: 0.0126
Hamming Loss: 0.1379


In [74]:
import pandas as pd

results = pd.DataFrame({
    "Input Text": ["Overview", "Keywords"],
    "Micro-F1": [0.2472, 0.0269],
    "Macro-F1": [0.0331, 0.0126],
    "Hamming Loss": [0.135, 0.1379]
})

results

Unnamed: 0,Input Text,Micro-F1,Macro-F1,Hamming Loss
0,Overview,0.2472,0.0331,0.135
1,Keywords,0.0269,0.0126,0.1379


In [75]:
from collections import defaultdict, Counter

genre_word_counts = defaultdict(Counter)

for _, row in train_df.iterrows():
    genres = row["genre_list"]
    tokens = row["overview_tokens"]

    for genre in genres:
        genre_word_counts[genre].update(tokens)

In [76]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stopwords = set(ENGLISH_STOP_WORDS)

In [77]:
filtered_genre_counts = {}

for genre, counter in genre_word_counts.items():
    filtered = {
        word: count
        for word, count in counter.items()
        if word not in stopwords and len(word) > 2
    }
    filtered_genre_counts[genre] = Counter(filtered)

In [88]:
MIN_FREQ = 3

top_words_per_genre = {}
bottom_words_per_genre = {}

for genre, counter in filtered_genre_counts.items():


    valid_words = {w: c for w, c in counter.items() if c >= MIN_FREQ}

    if len(valid_words) == 0:
        continue

    sorted_words = sorted(valid_words.items(), key=lambda x: x[1], reverse=True)

    top_words_per_genre[genre] = sorted_words[:10]
    bottom_words_per_genre[genre] = sorted_words[-10:]

In [79]:
import pandas as pd

def display_genre_table(genre):
    top_df = pd.DataFrame(top_words_per_genre[genre], columns=["Word", "Frequency"])
    bottom_df = pd.DataFrame(bottom_words_per_genre[genre], columns=["Word", "Frequency"])

    print(f"\n===== {genre} =====")
    print("\nTop 10 Words:")
    display(top_df)

    print("\nBottom 10 Words (freq ≥ 3):")
    display(bottom_df)

In [80]:
for genre in sorted(top_words_per_genre.keys()):
    display_genre_table(genre)


===== Action =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,new,122
1,world,117
2,life,90
3,young,90
4,man,83
5,time,74
6,agent,72
7,war,69
8,team,66
9,city,62



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,jill,3
1,entei,3
2,speed,3
3,mérida,3
4,shuttle,3
5,tongan,3
6,evening,3
7,mystery,3
8,doom,3
9,joey,3



===== Adventure =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,world,107
1,new,96
2,young,87
3,life,69
4,save,55
5,help,52
6,man,52
7,war,49
8,time,48
9,family,46



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,entei,3
1,mérida,3
2,austin,3
3,potter,3
4,shuttle,3
5,chaney,3
6,tongan,3
7,champion,3
8,doom,3
9,melvin,3



===== Animation =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,world,50
1,new,41
2,life,30
3,adventure,26
4,young,25
5,city,25
6,friends,25
7,save,24
8,family,18
9,film,18



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,main,3
1,nickelodeon,3
2,final,3
3,riley,3
4,invade,3
5,entei,3
6,mérida,3
7,retrieve,3
8,alex,3
9,garfield,3



===== Comedy =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,life,212
1,new,210
2,love,141
3,young,134
4,friends,122
5,man,121
6,family,115
7,world,113
8,time,91
9,friend,89



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,debo,3
1,leon,3
2,toxic,3
3,stretch,3
4,rajveer,3
5,radhika,3
6,garfield,3
7,promised,3
8,giselle,3
9,leopold,3



===== Crime =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,police,68
1,new,65
2,life,63
3,man,47
4,story,47
5,family,46
6,murder,43
7,city,42
8,agent,42
9,young,41



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,news,3
1,federal,3
2,public,3
3,hoover,3
4,sees,3
5,mccain,3
6,wesley,3
7,polly,3
8,ashley,3
9,joey,3



===== Documentary =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,film,29
1,documentary,26
2,world,18
3,life,13
4,years,12
5,look,12
6,new,11
7,story,10
8,america,10
9,michael,10



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,best,3
1,played,3
2,health,3
3,kids,3
4,company,3
5,obama,3
6,free,3
7,download,3
8,available,3
9,canada,3



===== Drama =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,life,373
1,young,239
2,new,236
3,story,232
4,love,193
5,family,192
6,man,189
7,world,164
8,father,130
9,years,129



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,catalyst,3
1,randy,3
2,giants,3
3,mccain,3
4,neville,3
5,ones,3
6,widows,3
7,pig,3
8,coriolanus,3
9,tess,3



===== Family =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,world,80
1,new,66
2,life,65
3,young,62
4,family,54
5,friends,48
6,save,45
7,story,34
8,hes,33
9,help,32



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,carmens,3
1,elliott,3
2,tongan,3
3,blanket,3
4,rajveer,3
5,radhika,3
6,horses,3
7,garfield,3
8,promised,3
9,giselle,3



===== Fantasy =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,world,65
1,life,49
2,young,49
3,new,45
4,evil,44
5,man,31
6,family,30
7,time,29
8,love,28
9,help,24



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,author,3
1,gracey,3
2,center,3
3,dog,3
4,hades,3
5,elliott,3
6,jewel,3
7,doom,3
8,giselle,3
9,leopold,3



===== History =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,story,35
1,war,35
2,life,23
3,world,19
4,army,19
5,film,17
6,set,14
7,true,14
8,love,14
9,british,13



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,sophie,3
1,just,3
2,gets,3
3,marriage,3
4,eddie,3
5,katadreuffe,3
6,force,3
7,malone,3
8,east,3
9,hoover,3



===== Horror =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,new,56
1,group,56
2,young,49
3,years,39
4,family,39
5,life,32
6,evil,32
7,world,31
8,mysterious,31
9,people,30



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,underwater,3
1,stegman,3
2,moving,3
3,plagued,3
4,julie,3
5,carol,3
6,billy,3
7,christmas,3
8,rescue,3
9,train,3



===== Music =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,life,32
1,musical,21
2,new,18
3,story,17
4,music,17
5,love,16
6,world,16
7,band,15
8,film,12
9,singer,12



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,second,3
1,hannah,3
2,change,3
3,leads,3
4,reach,3
5,tracy,3
6,destroy,3
7,tragedy,3
8,roger,3
9,canada,3



===== Mystery =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,life,37
1,new,28
2,young,27
3,world,26
4,murder,25
5,mysterious,24
6,man,24
7,soon,20
8,town,20
9,past,20



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,romulus,3
1,clary,3
2,hollywood,3
3,rainey,3
4,question,3
5,role,3
6,eva,3
7,michael,3
8,caul,3
9,gracey,3



===== Romance =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,love,190
1,life,168
2,young,120
3,new,101
4,man,93
5,woman,79
6,story,66
7,world,64
8,finds,56
9,falls,54



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,regime,3
1,steve,3
2,resist,3
3,rajveer,3
4,radhika,3
5,gerry,3
6,widows,3
7,giselle,3
8,leopold,3
9,tess,3



===== Science Fiction =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,world,82
1,earth,58
2,planet,56
3,new,52
4,time,50
5,alien,48
6,man,42
7,years,41
8,life,39
9,young,35



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,corruption,3
1,mankinds,3
2,kitai,3
3,inner,3
4,speed,3
5,continue,3
6,cause,3
7,neville,3
8,mass,3
9,leopold,3



===== TV Movie =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,christmas,4



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,christmas,4



===== Thriller =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,new,116
1,young,116
2,life,112
3,man,107
4,world,79
5,story,74
6,family,73
7,agent,69
8,finds,68
9,years,63



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,reach,3
1,facing,3
2,gracey,3
3,mccain,3
4,madison,3
5,advantage,3
6,champion,3
7,coriolanus,3
8,joey,3
9,melvin,3



===== War =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,war,51
1,army,23
2,world,22
3,soldiers,13
4,young,13
5,british,12
6,story,12
7,group,11
8,film,11
9,battle,11



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,local,3
1,mother,3
2,rafael,3
3,americans,3
4,civilians,3
5,nanking,3
6,netherlands,3
7,way,3
8,east,3
9,russian,3



===== Western =====

Top 10 Words:


Unnamed: 0,Word,Frequency
0,town,20
1,west,13
2,gang,11
3,young,10
4,old,9
5,american,9
6,sheriff,8
7,war,8
8,men,8
9,texas,8



Bottom 10 Words (freq ≥ 3):


Unnamed: 0,Word,Frequency
0,averill,3
1,battle,3
2,billy,3
3,murder,3
4,lawman,3
5,hogue,3
6,stagecoach,3
7,bandits,3
8,years,3
9,chaney,3


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,
    stop_words='english',
    min_df=3
)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["overview"])
X_val_tfidf   = tfidf_vectorizer.transform(val_df["overview"])
X_test_tfidf  = tfidf_vectorizer.transform(test_df["overview"])

feature_names = tfidf_vectorizer.get_feature_names_out()

In [83]:
from sklearn.linear_model import LogisticRegression

genre_models = {}

for genre, idx in genre_to_idx.items():

    y_binary = y_train_multi[:, idx]  # 0/1 labels

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_tfidf, y_binary)

    genre_models[genre] = clf

In [85]:
indicative_words = {}

for genre, clf in genre_models.items():

    coefs = clf.coef_[0]

    top_indices = coefs.argsort()[-10:][::-1]

    top_words = [(feature_names[i], coefs[i]) for i in top_indices]

    indicative_words[genre] = top_words

In [86]:
import pandas as pd

for genre in sorted(indicative_words.keys()):

    print(f"\n===== {genre} =====")

    df_genre = pd.DataFrame(indicative_words[genre],
                            columns=["Word", "Coefficient"])

    display(df_genre)


===== Action =====


Unnamed: 0,Word,Coefficient
0,agent,2.561577
1,cop,2.239787
2,criminals,2.180777
3,ruthless,2.087417
4,cia,1.800131
5,hero,1.785827
6,james,1.687644
7,forces,1.576335
8,avenge,1.50887
9,stop,1.49866



===== Adventure =====


Unnamed: 0,Word,Coefficient
0,adventure,2.116755
1,bond,2.053627
2,save,1.714517
3,king,1.6777
4,jungle,1.606429
5,earth,1.541984
6,forces,1.51645
7,world,1.506005
8,power,1.502433
9,dragon,1.469004



===== Animation =====


Unnamed: 0,Word,Coefficient
0,adventure,2.526161
1,animated,1.923619
2,world,1.70158
3,human,1.658371
4,dragon,1.496275
5,city,1.328658
6,save,1.28546
7,animals,1.271992
8,forest,1.172687
9,unlikely,1.117958



===== Comedy =====


Unnamed: 0,Word,Coefficient
0,comedy,3.045028
1,big,1.638708
2,friends,1.574568
3,store,1.490385
4,girlfriend,1.47143
5,movie,1.379253
6,vacation,1.323548
7,guy,1.319365
8,parents,1.31591
9,decides,1.293275



===== Crime =====


Unnamed: 0,Word,Coefficient
0,police,3.599592
1,cop,2.360467
2,fbi,2.252152
3,crime,2.241282
4,murder,2.146607
5,drug,2.077893
6,detective,1.835246
7,agent,1.828687
8,criminal,1.692517
9,mob,1.613839



===== Documentary =====


Unnamed: 0,Word,Coefficient
0,documentary,4.26551
1,look,2.349373
2,film,1.535186
3,michael,1.256694
4,interviews,1.079987
5,footage,1.05173
6,tour,1.033841
7,filmmaker,1.030873
8,journey,0.860447
9,america,0.858236



===== Drama =====


Unnamed: 0,Word,Coefficient
0,story,2.821986
1,life,2.303386
2,wife,2.047316
3,husband,1.623437
4,drama,1.622629
5,father,1.43648
6,war,1.434677
7,love,1.432591
8,mother,1.39746
9,struggle,1.310367



===== Family =====


Unnamed: 0,Word,Coefficient
0,save,2.237489
1,dog,2.231227
2,adventure,2.174087
3,kids,2.010968
4,world,1.797777
5,boy,1.736172
6,christmas,1.643986
7,animals,1.535694
8,adventures,1.375972
9,land,1.355533



===== Fantasy =====


Unnamed: 0,Word,Coefficient
0,evil,2.672862
1,ancient,1.910728
2,world,1.799248
3,vampire,1.677624
4,king,1.674244
5,magic,1.645741
6,powers,1.625849
7,dragon,1.575381
8,magical,1.563965
9,battle,1.546981



===== History =====


Unnamed: 0,Word,Coefficient
0,war,2.646624
1,story,1.712257
2,army,1.51815
3,queen,1.464774
4,soldiers,1.426399
5,british,1.371547
6,chronicle,1.31201
7,true,1.241499
8,nation,1.207656
9,allied,1.119656



===== Horror =====


Unnamed: 0,Word,Coefficient
0,horror,2.890878
1,group,2.468817
2,vampire,2.124894
3,zombies,2.044966
4,evil,1.849991
5,vampires,1.765372
6,deadly,1.750574
7,mysterious,1.696983
8,people,1.621942
9,dead,1.577842



===== Music =====


Unnamed: 0,Word,Coefficient
0,musical,3.658314
1,music,2.322896
2,singer,2.202954
3,band,1.852682
4,dancer,1.603552
5,rock,1.37734
6,concert,1.253014
7,broadway,1.140319
8,bands,1.091923
9,dance,1.091052



===== Mystery =====


Unnamed: 0,Word,Coefficient
0,murder,1.947611
1,investigation,1.700069
2,murdered,1.656027
3,past,1.600751
4,mysterious,1.552687
5,detective,1.487229
6,clues,1.326309
7,investigates,1.264163
8,killer,1.23874
9,fbi,1.211963



===== Romance =====


Unnamed: 0,Word,Coefficient
0,love,5.693886
1,romance,2.581026
2,woman,2.472747
3,falls,2.231259
4,meets,2.164693
5,romantic,2.07317
6,marriage,1.902688
7,life,1.781772
8,relationship,1.780077
9,girl,1.648511



===== Science Fiction =====


Unnamed: 0,Word,Coefficient
0,planet,3.832261
1,alien,3.685352
2,earth,3.430483
3,future,2.879089
4,space,2.092736
5,time,1.922435
6,robot,1.815576
7,world,1.706693
8,human,1.641488
9,travel,1.626106



===== TV Movie =====


Unnamed: 0,Word,Coefficient
0,christmas,0.55564
1,worked,0.397415
2,brown,0.342022
3,spy,0.333024
4,service,0.326339
5,ii,0.314901
6,office,0.30643
7,charlie,0.279611
8,run,0.274718
9,sharkinfested,0.273104



===== Thriller =====


Unnamed: 0,Word,Coefficient
0,agent,2.326931
1,murder,1.827622
2,kidnapped,1.675723
3,killer,1.588434
4,assassin,1.573233
5,secret,1.543876
6,murdered,1.514569
7,police,1.486384
8,dangerous,1.423998
9,events,1.414833



===== War =====


Unnamed: 0,Word,Coefficient
0,war,4.790318
1,army,2.495632
2,soldiers,1.881947
3,ii,1.561135
4,vietnam,1.326268
5,officer,1.317048
6,british,1.230232
7,village,1.182873
8,allied,1.11004
9,battle,1.043161



===== Western =====


Unnamed: 0,Word,Coefficient
0,west,2.296551
1,town,1.997144
2,civil,1.502725
3,sheriff,1.26862
4,outlaw,1.267235
5,gang,1.238545
6,cattle,1.18145
7,texas,1.146315
8,bounty,1.009643
9,old,1.001654
