## Imports

In [None]:
import ast
import json
from string import punctuation

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
#
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
#
# Resources used by the tokenize step: the English stop-word list and the
# Punkt tokenizer data that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps word_tokenize working across versions.
nltk.download('punkt_tab')

In [None]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

## Loading and exploring dataset <br>
[Dataset link](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv)

In [None]:
# Read the metadata CSV from the working directory and preview the first rows.
df = pd.read_csv(
    'movies_metadata.csv',
    low_memory=False,
)
df.head()

In [None]:
# List the available columns so the ones relevant to genre prediction can be picked.
df.columns

In [None]:
# Look at one raw 'genres' value (row label 1) and its Python type.
sample_genres = df.loc[1, 'genres']
print(sample_genres)
print(type(sample_genres))

In [None]:
# Count missing values in each candidate text/label column.
columns_of_interest = ['original_title', 'genres', 'tagline', 'overview']
df[columns_of_interest].isna().sum()

## Data cleaning

After exploring the dataset, I found many NaN values and columns that are not needed for our purpose. So I made a subset containing only the required columns, and will remove the rows that contain NaN:<br>
`genres`, `tagline`, `overview`

In [None]:
# Keep only the label column and the two free-text columns.
df_subset = df.loc[:, ['genres', 'tagline', 'overview']]
df_subset.head()

In [None]:
# df comes straight from read_csv with its default index, so this reset leaves
# the row labels unchanged at this point; it does not renumber rows that are
# removed afterwards.
df_subset = df_subset.reset_index(drop=True)
# Report (rows, columns) before any rows are dropped.
print(df_subset.shape)
df_subset.head()

Dropping rows containing NaN values

In [None]:
# Drop every row with a missing genre, tagline or overview, then renumber the
# rows 0..n-1. Without the reset the index keeps gaps, and the later
# pd.concat(..., axis=1) with the freshly built genre matrix aligns on index
# labels, pairing rows with the wrong labels and creating NaN rows.
df_subset = df_subset.dropna().reset_index(drop=True)
# A bare `df_subset.shape` in the middle of a cell displays nothing, so print it.
print(df_subset.shape)
df_subset.head()

Converting the genre column to string

In [None]:
def handel_genre(text):
    """Turn the stringified genre list of one movie into a lowercase label string.

    Args:
        text: String holding a list of dicts that each have a ``'name'`` key,
            e.g. ``"[{'id': 12, 'name': 'Adventure'}]"``.

    Returns:
        The genre names joined by single spaces and lower-cased
        (``'adventure'`` for the example above); ``''`` for an empty list.

    Raises:
        ValueError: If ``text`` is neither a valid Python literal nor valid
            JSON after quote replacement (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
    """
    try:
        # The values are Python-style reprs (single-quoted), so parse them as
        # literals. This also copes with an apostrophe inside a name, which a
        # blanket quote replacement would turn into invalid JSON.
        data = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Fall back to the previous behaviour for JSON-style payloads
        # (e.g. ones containing null/true/false).
        data = json.loads(text.replace("'", '"'))
    return ' '.join(item['name'] for item in data).lower()

The `.apply()` method in pandas applies a function to every value of a DataFrame column. <br>
I have replaced the 'genres' column with the converted one.

* "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]" <br>
TO <br>
* adventure fantasy family

In [None]:
# Replace each raw genre string with its space-separated, lower-cased names.
raw_genres = df_subset['genres']
df_subset['genres'] = raw_genres.map(handel_genre)
df_subset.head()

## Writing a function to tokenize the text and applying it to the dataset

- We tokenize the text and drop stop words when doing NLP because not every word contributes to the context of a sentence. <br>
For instance, in `he` `is` `a` `boy` `.`, only `boy` is a meaningful tag that gives us the general context. The same goes for punctuation. <br>
In most cases this results in better accuracy and reduces computation.

In [None]:
# Built once instead of on every call: tokenize runs per row via .apply(), and
# asking NLTK for the stop-word list each time repeats the same work.
_STOP_WORDS_CACHE = {}
_PUNCTUATION_CHARS = frozenset(punctuation)


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def tokenize(text):
    """Lower-case ``text`` and strip English stop words and punctuation tokens.

    Args:
        text: Raw string to clean.

    Returns:
        The remaining tokens, lower-cased and joined by single spaces.
    """
    stop_words = _english_stop_words()
    kept_tokens = []
    for token in word_tokenize(text, 'english'):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # `punctuation` is a string, so `lowered in punctuation` would be a
        # substring test and let tokens such as '...' or '--' through. Check
        # character by character so any all-punctuation token is dropped.
        if all(char in _PUNCTUATION_CHARS for char in lowered):
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

In [None]:
# Clean both free-text columns with the same tokenizer.
for text_column in ('tagline', 'overview'):
    df_subset[text_column] = df_subset[text_column].apply(tokenize)
df_subset.head()

## Encoding genres For classification

In [None]:
# One-hot encode the genre labels: each whitespace-separated token in 'genres'
# becomes its own 0/1 column (so 'science fiction' yields both a 'science' and
# a 'fiction' column).
mlb = MultiLabelBinarizer()
genre_lists = df_subset['genres'].apply(lambda x: x.split())
# Reuse df_subset's index for the encoded frame. pd.concat(axis=1) aligns on
# index labels, and df_subset's index may have gaps after rows were dropped; a
# default 0..n-1 index would attach labels to the wrong movies and add rows
# that are NaN in every original column.
genre_encoded = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=df_subset.index,
)
df_subset = pd.concat([df_subset, genre_encoded], axis=1)
df_subset = df_subset.drop('genres', axis=1)
df_subset.head()

## Vectorizer

As the text of both the overview and the tagline can be used for prediction, I merged them together as a simple approach.

In [None]:
# Build one text field per movie: tagline, a space, then the overview.
tagline_text = df_subset['tagline']
overview_text = df_subset['overview']
df_subset['combined_text'] = tagline_text + ' ' + overview_text
df_subset.head()

In [None]:
# Discard any row that still has a missing value, then show the remaining size.
df_subset = df_subset.dropna(axis=0, how='any')
df_subset.shape

In [None]:
# Show just the first row to check the combined layout.
df_subset.iloc[:1]

Vectorizing using TF-IDF

In [None]:
# Fit TF-IDF on the combined text, capping the vocabulary at 800 terms.
vect = TfidfVectorizer(max_features=800)
text_corpus = df_subset['combined_text']
vect_features = vect.fit_transform(text_corpus)

In [None]:
# Inspect the vocabulary terms the vectorizer kept (one per TF-IDF column).
vect.get_feature_names_out()

In [None]:
# Densify the TF-IDF matrix into a DataFrame with one column per term.
feature_names = vect.get_feature_names_out()
tfidf_df = pd.DataFrame(vect_features.toarray(), columns=feature_names)
# The raw text columns are no longer needed once they are vectorized.
df_subset = df_subset.drop(columns=['tagline', 'overview', 'combined_text'])

In [None]:
# Preview the TF-IDF feature frame.
tfidf_df.head()

In [None]:
# Put labels and TF-IDF features side by side; both sides are renumbered
# 0..n-1 first so the column-wise concat pairs rows by position.
labels_part = df_subset.reset_index(drop=True)
features_part = tfidf_df.reset_index(drop=True)
df_subset = pd.concat([labels_part, features_part], axis=1)
df_subset.head()

In [None]:
# Feature matrix: every column after the genre indicator columns. The split
# point comes from the fitted binarizer rather than a hard-coded 22, so it
# stays correct if the set of genre labels changes.
n_genre_columns = len(mlb.classes_)
X = df_subset.iloc[:, n_genre_columns:]
X.head()

In [None]:
# Target matrix: the leading genre indicator columns. Their count is taken
# from the fitted binarizer rather than a hard-coded 22, so it stays correct
# if the set of genre labels changes.
n_genre_columns = len(mlb.classes_)
y = df_subset.iloc[:, :n_genre_columns]
y.head()

## Splits and further

In [None]:
# Label names for the report, read from the target frame so they always match
# the columns of y in number and order (a hand-typed list can drift from the
# data).
genre_columns = list(y.columns)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Sanity-check the split: train features/labels, then test features/labels.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
# Previously printed y_train.shape a second time, so the test labels were
# never actually checked.
print(y_test.shape)

In [None]:

# One binary logistic regression per genre column, wrapped in a one-vs-rest classifier.
base_estimator = LogisticRegression(max_iter=1000)
clf = OneVsRestClassifier(base_estimator)

clf.fit(X_train, y_train)

In [None]:
# Predict the genre indicators for the held-out rows.
y_pred = clf.predict(X_test)

In [None]:
# Eyeball the raw prediction matrix (rows = test movies, columns = genre labels).
print(y_pred)

In [None]:
# Per-label metrics on the test split; target_names supplies the row labels of the report.
print(classification_report(y_test, y_pred, target_names=genre_columns))

In [None]:
# NOTE(review): with multilabel targets accuracy_score is expected to be subset
# accuracy (a row counts only if every genre flag matches) - confirm against
# the scikit-learn docs before comparing with per-label scores.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)