In [2]:
import pandas as pd
import re
import string

In [3]:
# Load the raw movie data; index_col=False tells pandas not to use the first column as the index.
df= pd.read_csv('./movies.csv', index_col=False)

In [4]:
df.head()

Unnamed: 0,title,description,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,The story of the Corleone crime family as Mich...,"Drama, Crime"
2,The Godfather Part II,Parallel stories of young Vito Corleoneâ€™s rise...,"Drama, Crime"
3,Schindler's List,"The true story of Oskar Schindler, who saved o...","Drama, History, War"
4,12 Angry Men,Twelve jurors deliberate the fate of a young m...,Drama


In [5]:
df.columns

Index(['title', 'description', 'genres'], dtype='object')

In [6]:
def clean_text(text):
    """Normalize a raw cell value into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. Missing values (``None`` / ``NaN`` / ``pd.NA``) become ``""``.
      2. Any other value is converted with ``str()`` so numeric cells do not crash.
      3. The text is lowercased.
      4. Every character that is not ``a-z``, ``0-9`` or whitespace is deleted
         (all punctuation and all non-ASCII characters).
      5. Runs of whitespace are collapsed to one space and the ends are trimmed.

    Whitespace is normalized *last* on purpose: deleting punctuation can leave
    doubled or dangling spaces behind (e.g. ``"a - b"`` or ``"what ?"``), so
    collapsing before the deletion would not guarantee single-spaced output.

    Parameters
    ----------
    text : object
        A scalar cell value, typically ``str`` or a missing-value marker.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input or input with no
        alphanumeric characters.
    """
    if pd.isna(text):  # handle missing values
        return ""
    text = str(text).lower()
    # After lowercasing, a single pass removes punctuation and non-ASCII symbols alike.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace only now, so gaps left by removed characters are cleaned up too.
    return re.sub(r'\s+', ' ', text).strip()

# Run clean_text over each text column, overwriting the column in place.
for col_name in ('title', 'description', 'genres'):
    df[col_name] = df[col_name].apply(clean_text)

# Write the cleaned frame to disk without the row index.
df.to_csv("movies_cleaned.csv", index=False)

print("CSV cleaned (punctuation removed) and saved as movies_cleaned.csv")

CSV cleaned (punctuation removed) and saved as movies_cleaned.csv


In [7]:
# Reload the cleaned data. keep_default_na=False stops pandas from turning empty
# strings (which clean_text produces for missing values) and tokens such as
# "nan" or "null" back into float NaN, so every text column stays a string.
df = pd.read_csv('./movies_cleaned.csv', keep_default_na=False)

In [8]:
df.head()

Unnamed: 0,title,description,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,drama crime
1,the godfather,the story of the corleone crime family as mich...,drama crime
2,the godfather part ii,parallel stories of young vito corleones rise ...,drama crime
3,schindlers list,the true story of oskar schindler who saved ov...,drama history war
4,12 angry men,twelve jurors deliberate the fate of a young m...,drama


## Bag of words

In [10]:
# Converting text to numbers (bag-of-words approach)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()  # default settings

# Learn the vocabulary from the descriptions and build the matrix of token counts
bow = cv.fit_transform(df['description'])

In [14]:
print(cv.vocabulary_)

{'imprisoned': 447, 'in': 448, 'the': 936, '1940s': 5, 'for': 348, 'double': 261, 'murder': 608, 'of': 644, 'his': 418, 'wife': 1054, 'and': 44, 'her': 412, 'lover': 546, 'banker': 76, 'andy': 45, 'dufresne': 268, 'builds': 122, 'new': 630, 'life': 531, 'shawshank': 836, 'prison': 712, 'earning': 273, 'respect': 765, 'through': 954, 'integrity': 470, 'hope': 424, 'story': 890, 'corleone': 185, 'crime': 190, 'family': 314, 'as': 58, 'michael': 577, 'rises': 780, 'to': 962, 'power': 703, 'after': 24, 'an': 43, 'attempt': 66, 'on': 650, 'fathers': 319, 'parallel': 668, 'stories': 889, 'young': 1074, 'vito': 1032, 'corleones': 186, 'rise': 779, 'expansion': 306, 'empire': 283, 'true': 989, 'oskar': 657, 'schindler': 809, 'who': 1051, 'saved': 806, 'over': 662, 'thousand': 950, 'jewish': 496, 'lives': 540, 'from': 367, 'nazis': 624, 'during': 270, 'world': 1068, 'war': 1033, 'ii': 440, 'twelve': 993, 'jurors': 507, 'deliberate': 223, 'fate': 317, 'man': 562, 'accused': 17, 'exposing': 309, 

In [15]:
print(bow[0].toarray())

[[0 0 0 ... 0 0 0]]


## Bag of n-grams

A bigram treats every two consecutive words as one token, a trigram every three consecutive words, and so on.

In [None]:
# ngram_range=(2, 2) keeps bigrams only; change it to (3, 3) for trigrams, (4, 4) for 4-grams, and so on.
cv = CountVectorizer(ngram_range=(2, 2))

# NOTE: `cv` and `bow` are rebound here, so from this cell on both names refer
# to the bigram vectorizer/matrix rather than the earlier bag-of-words ones.
bi_gram = cv.fit_transform(df['description'])
bow = bi_gram

In [17]:
print(cv.vocabulary_)

{'imprisoned in': 824, 'in the': 851, 'the 1940s': 1586, '1940s for': 6, 'for the': 594, 'the double': 1610, 'double murder': 463, 'murder of': 1103, 'of his': 1177, 'his wife': 784, 'wife and': 1915, 'and her': 110, 'her lover': 723, 'lover banker': 1019, 'banker andy': 213, 'andy dufresne': 160, 'dufresne builds': 470, 'builds new': 291, 'new life': 1140, 'life in': 985, 'in shawshank': 847, 'shawshank prison': 1452, 'prison earning': 1304, 'earning respect': 476, 'respect through': 1368, 'through integrity': 1728, 'integrity and': 875, 'and hope': 112, 'the story': 1675, 'story of': 1517, 'of the': 1193, 'the corleone': 1599, 'corleone crime': 369, 'crime family': 377, 'family as': 527, 'as michael': 184, 'michael corleone': 1063, 'corleone rises': 370, 'rises to': 1386, 'to power': 1765, 'power after': 1290, 'after an': 26, 'an attempt': 64, 'attempt on': 197, 'on his': 1215, 'his fathers': 756, 'fathers life': 537, 'parallel stories': 1254, 'stories of': 1516, 'of young': 1206, 'y

## tf-idf vectorization


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer()  # default settings
# Fit on the descriptions and show the resulting TF-IDF matrix as a dense array
tfidf.fit_transform(df['description']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])