In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# The recorded predictions show values wrapped in spaces (e.g. ' drama '), i.e.
# the ':::' separator is padded with whitespace in the files. Splitting on the
# bare ':::' keeps that padding inside every title, genre label and description,
# so the separator regex swallows the surrounding whitespace as well.
# A multi-character separator is treated as a regex, which needs the python engine.
FIELD_SEPARATOR = r'\s*:::\s*'

train_data = pd.read_csv(
    'train_data.txt',
    sep=FIELD_SEPARATOR,
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
    engine='python',
)
test_data = pd.read_csv(
    'test_data.txt',
    sep=FIELD_SEPARATOR,
    names=['ID', 'TITLE', 'DESCRIPTION'],
    engine='python',
)

In [None]:
# Fetch the NLTK resources used below: tokenizer models, the English stop-word
# list and the WordNet data used by the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Shared, module-level helpers for clean_text: built once so they are not
# re-created for every description.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

In [None]:
def clean_text(text):
    """Normalise a description into a space-separated string of lemmas.

    The text is tokenized with ``nltk.word_tokenize``, lower-cased, stripped of
    tokens that are not purely alphabetic or that appear in ``stop_words``, and
    each remaining token is passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. ``None`` or the ``NaN``
        pandas uses for a missing field) is treated as an empty description.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the filtering or the input is not a string.
    """
    # Missing descriptions arrive as NaN/None; the tokenizer would raise on them.
    if not isinstance(text, str):
        return ""

    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token.isalpha() and token not in stop_words
    )

In [None]:
# Store the normalised text in a new column on both frames; the raw
# DESCRIPTION column is left as loaded.
for frame in (train_data, test_data):
    frame['cleaned_description'] = frame['DESCRIPTION'].apply(clean_text)

In [None]:
# TF-IDF features capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

train_corpus = train_data['cleaned_description']
test_corpus = test_data['cleaned_description']

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto them.
X_train = tfidf_vectorizer.fit_transform(train_corpus)
X_test = tfidf_vectorizer.transform(test_corpus)

In [None]:
# Linear-kernel SVM trained on the TF-IDF matrix with the GENRE column as target.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; expect this cell to be slow on the
# full training set.
y_train = train_data['GENRE']
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

In [None]:
from joblib import dump

# Persist the fitted classifier to disk so it can be reloaded without retraining.
# NOTE(review): only svm_classifier is written here; the fitted tfidf_vectorizer
# is not saved by this cell, and it is what turns raw text into the features the
# classifier was trained on.
model_file = 'svm_model.joblib'
# dump() is the cell's last expression, so its return value is shown as output.
dump(svm_classifier, model_file)


['svm_model.joblib']

In [None]:
from joblib import load

# Rebind svm_classifier to the copy stored on disk, using the same path the
# classifier was dumped to earlier in this notebook.
# joblib.load unpickles the file, so it should only be pointed at trusted files.
svm_classifier = load('svm_model.joblib')


In [None]:
# Predict one label per row of the TF-IDF test matrix.
predictions = svm_classifier.predict(X_test)

In [None]:

# Number of predictions echoed into the notebook. Printing every row floods the
# output stream ("IOPub data rate exceeded") and bloats the saved notebook; the
# complete result set is available in predictions.csv instead.
PREVIEW_ROWS = 10

# Save first, so an interrupted or failing preview cannot lose the predictions.
test_data['PREDICTED_GENRE'] = predictions
test_data.to_csv('predictions.csv', index=False)

# Bound the loop by the actual number of rows rather than a hard-coded count,
# which would raise IndexError on a smaller file and skip rows on a larger one.
for i in range(min(PREVIEW_ROWS, len(test_data))):
    print("Movie Name:", test_data['TITLE'].iloc[i])
    print("Predicted Genre:", predictions[i])
    print("Movie Description:", test_data['DESCRIPTION'].iloc[i])
    print()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Movie Name:  Huellas de un siglo (2010) 
Predicted Genre:  documentary 
Movie Description:  "Fingerprints of a Century" (Huellas de un siglo) is a series of 26 documentaries for television. The subject matter includes, the Centenary of the Republic Argentina, Alcorta's shout (the creation of the Argentine Agrarian Federation), The war of Malvinas (1982), The Tragic Week, The earthquake of San Juan, The Cordobazo, The "puebladas", Perón's arrival, The return of the democracy (Alfonsín, 1983), and some other milestones of the Argentine history.

Movie Name:  The Rookie's Return (1920) 
Predicted Genre:  drama 
Movie Description:  A young soldier is discharged from the service and has trouble making a living. However, when he inherits a great deal of money, he finds his troubles only beginning.

Movie Name:  La sagrada familia (2005) 
Predicted Genre:  drama 
Movie Description:  It's Easter in the beach house of a Chilean w

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Movie Name:  The Dorp: 40 Days of Our Lives (2009) 
Predicted Genre:  drama 
Movie Description:  A place on the Cape Flats in 2009: A society challenged by gang violence and drug abuse but with people involved that want to make a difference. We follow four characters representing four generations (childhood, youth, adulthood and senior) through their neighborhood. They see their surroundings out of different angles and find themselves in different situations. But they all wish for better, for a brighter future. You can feel that people want to do better but they are very often trapped in a circle of poverty that is hard to breach. The frustration of the citizens of THE DORP finds expression in a violent clash with the Police, one of the biggest community uprisings against the authorities in the area since Apartheid has ended.

Movie Name:  Four in the Afternoon (1951) 
Predicted Genre:  short 
Movie Description:  Poems na

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Movie Name:  "Ischeznuvshie" (2009) 
Predicted Genre:  documentary 
Movie Description:  1942. Major Toporkov escaped from captivity and sneaks into a guerrilla camp, which is surrounded by the Germans. Mayor agrees with the commander to have an access to the weapons, to take to the concentration camp, where the prisoners are preparing an uprising.

Movie Name:  Mein ganz gewöhnliches Leben (2005) 
Predicted Genre:  comedy 
Movie Description:  After a quick breakfast with his parents, Ben leaves the house and takes his bike for a fast and risky ride. While the opening credits are shown we see some spectacular camera movements following Ben on his way to school with the bike. The music is some kind of pop-rock; very fast editing. The audience must think: the boy on the bike is the cool guy in this movie. However this is a big mistake: Ben hardly enters the school when some of his classmates pull him to the restroom and bea

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Movie Name:  The Greatest Game Ever Played (2008) 
Predicted Genre:  documentary 
Movie Description:  The 1958 National Football League championship game between the Baltimore Colts and the New York Giants is known as "The Greatest Game Ever Played". It was the first (and only) professional football title game to ever result in an overtime, featured 17 men who were ultimately inducted into the Pro Football Hall of Fame and included wild swings of momentum and the Colts execution of their two-minute drill to tie the game in the closing seconds of regulation time. To commemorate the 50th anniversary, ESPN films paired eleven of the games participants with current players from the Indianapolis Colts and New York Giants to compare the nature of the sport 50 years later, as well as sportscasters, reporters, photographers, spectators, cheerleaders, majorettes and marching band members who contributed their reminiscences. "To co

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Movie Name:  Balala the Fairies: The Movie (2013) 
Predicted Genre:  fantasy 
Movie Description:  The film is mainly about the fairies's fate "star of the key" mysterious fall, Blue fairy back to the human world, with Qi Mei fairy and Xue Mei fairy together, Guarding the positive energy, exposing the big conspiracy and defending the fairy castle.

Movie Name:  East End Babylon (2012) 
Predicted Genre:  documentary 
Movie Description:  The story of London's toughest and poorest part as told through the eyes of its most iconic band. From the bombs that flew in World War II and from the greatest industrial docks the world ever saw, to the formation of the original and best Terrace Band of them all, the battles, living outside the law, the wilderness years of both the band and the area that spawned them, and eventually to the rebirth and transformation of the band into a worldwide cult, this is the rockumentary to beat them 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Movie Description:  Based on James Barrie's play "Alice Sit-By-The-Fire". In turn-of-the-century New York, a young girl who believes she's learned "the seamy side of life" from a risque play takes it upon herself to rescue her estranged mother from what appears to be a steamy love affair.

Movie Name:  Octia of the Pink Ocean (2014) 
Predicted Genre:  drama 
Movie Description:  'Who am I? Where has my memory gone?' Karnika, Insect-Angel of GoldSun Sex and Goddess of the Pink Ocean, has been resurrected with no memory of her past. Alone in what appears to be a perfect island paradise, she soon uncovers a terrible plot to destroy the ocean world and all of its inhabitants. Alongside her lover Octia, Karnika battles Mermecho, an evil Shark/Machine hybrid determined to cover the world with The Blackening. Rediscovering her power along the way, both Karnika and Octia fall prey to Mermecho's devious traps. But if a Goddess can'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Movie Name:  The 64th Annual Academy Awards (1992) 
Predicted Genre:  family 
Movie Description:  Award of the American academy of cinematographic arts and sciences, from 1940th known as "Oscar", - American film award created in 1929 and traditionally handed to the figures of cinematographic art for their contribution to creation of movies.

Movie Name:  With You (2006/I) 
Predicted Genre:  drama 
Movie Description:  Tired of the corporate grind, Jim Polk quits his "9-to-5" job and leaves the big city in search of a simpler life. Renting a modest farmhouse in the small town of Mt. Prospect, Jim quickly discovers that there's more to this town, and his new home, than first meets the eye. Jim's bedroom overlooks Moloch Manor, a distant, decaying mansion that residents say is haunted. Local legend claims the spirit of Old Amon roams Moloch Manor's grounds as well as Mt. Prospect itself. Soon, strange and frightening events 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Movie Name:  "Dog Bless America" (2016) 
Predicted Genre:  documentary 
Movie Description:  Historical icon Paul Revere once again finds his call to ride in modern-day America with a new boisterous proclamation to the country. Each episode bridges together to create one continuous mad dash of a narrative and hilariously irreverent approach to brand driven creative content.

Movie Name:  "Broken House Chronicles" (2002) 
Predicted Genre:  comedy 
Movie Description:  Broken House Chronicles is the anti-This Old House. You won't find professional contractors or laser guided chop saws -- instead, George and Leigh show that a couple of regular people can do home improvement projects, while showing the materials necessary, common problems you might run into, tools necessary and/or available for rental, and occassionally breaking into song. It's entertaining, yet educational -- and it might inspire you to do some weekend projec

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Movie Name:  Cheryomushki (1963) 
Predicted Genre:  drama 
Movie Description:  The title of the film (literally "Cherry Town") refers to new towns or neighborhoods based on middle-class urban development, where every block of houses looks exactly the same as the next. In the Soviet Union of the late 1950s this was thought to be equivalent to the pot of gold at the end of the rainbow. This musical involves the tribulations of four couples whose lives humorously intersect. It opens with Sergei, a driver, meeting up with his good friend Boris. After lamenting being alone, Boris falls head over heals for a museum guide, Lida. She resists his advances but that doesn't stop Boris. She leads him to her friends, Masha and Sasha, who are planning to get married and are elated to have recently received permission to move into an apartment in the Moscow Cheryomushki. On her way home (with Boris constantly tagging along), Lida discov