## Downloading the Dataset

In [1]:
# Fetch the dataset archive from Google Drive with the gdown CLI.
# In the recorded run it was saved as MovieSummaries.tar in the working directory.
!gdown https://drive.google.com/u/0/uc?id=1v9u3NTeJtlIYdNGLqMa2kZNTGjMCvjjh

Downloading...
From: https://drive.google.com/u/0/uc?id=1v9u3NTeJtlIYdNGLqMa2kZNTGjMCvjjh
To: /content/MovieSummaries.tar
100% 134M/134M [00:01<00:00, 99.7MB/s]


In [2]:
# Confirm the downloaded archive is present in the working directory.
!ls

MovieSummaries.tar  sample_data


In [3]:
# Unpack the archive (the recorded run shows a MovieSummaries/ directory
# appearing afterwards), then list the working directory to verify.
!tar -xf MovieSummaries.tar && ls

MovieSummaries	MovieSummaries.tar  sample_data


## Loading and Cleaning

In [4]:
import pandas as pd

In [5]:
# Load the movie metadata table (the file that carries the genres column).
# The TSV has no header row, so the column names are supplied explicitly.
columns = [
    "wikipedia id",
    "freebase id",
    "movie name",
    "year released",
    "revenue",
    "runtime",
    "language",
    "country released",
    "genres",
]
metadata = pd.read_csv(
    "./MovieSummaries/movie.metadata.tsv",
    sep="\t",
    names=columns,
    header=None,
)

metadata

Unnamed: 0,wikipedia id,freebase id,movie name,year released,revenue,runtime,language,country released,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0..."
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."


In [6]:
import json


def _genre_names(raw):
    """Return the list of genre names held in one ``genres`` cell.

    The raw column stores a JSON object per movie that maps an id to a genre
    name. Only the names (the object's values) are kept.

    The function also accepts a value that has already been parsed (a dict)
    or already been converted (a list), so re-running this cell on the
    converted column is a no-op instead of a ``TypeError`` from
    ``json.loads``.

    Args:
        raw: a JSON string, a dict of id -> name, or a list of names.

    Returns:
        list of genre names.
    """
    if isinstance(raw, str):
        raw = json.loads(raw)
    if isinstance(raw, dict):
        return list(raw.values())
    return list(raw)


# genres column is in json format: parse it and keep only the genre names
metadata.genres = metadata.genres.apply(_genre_names)

metadata

Unnamed: 0,wikipedia id,freebase id,movie name,year released,revenue,runtime,language,country released,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}",[Drama]
...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",[Drama]
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","[Biographical film, Drama, Documentary]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Satire, Comedy]"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","[Science Fiction, Japanese Movies, Adventure, ..."


In [7]:
# Load the plot summaries: one tab-separated (wikipedia id, summary) pair per
# line, again without a header row.
columns = ["wikipedia id", "summary"]
summ = pd.read_csv(
    "./MovieSummaries/plot_summaries.txt",
    sep="\t",
    names=columns,
    header=None,
)

summ

Unnamed: 0,wikipedia id,summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [8]:
# keep only the id, the title and the genres from the metadata
meta = metadata.drop(
    columns=["freebase id", "year released", "revenue", "runtime", "country released", "language"]
)

# inner join on the wikipedia id: only movies that have both genres and a
# plot summary survive
df = meta.merge(summ, on="wikipedia id", how="inner")

df

Unnamed: 0,wikipedia id,movie name,genres,summary
0,975900,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,[Drama],"Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...
42199,23851782,The Ghost Train,"[Crime Fiction, Thriller, Comedy, Supernatural]",{{plot}} The film opens with a Great Western e...
42200,35228177,Mermaids: The Body Found,[Drama],Two former National Oceanic Atmospheric Admini...
42201,34980460,Knuckle,"[Biographical film, Drama, Documentary]",{{No plot}} This film follows 12 years in the ...
42202,913762,The Super Dimension Fortress Macross II: Lover...,"[Science Fiction, Japanese Movies, Adventure, ...","The story takes place in the year 2092,The Sup..."


In [9]:
from collections import Counter

# tally how many movies carry each raw genre label
genre_counter = Counter(label for labels in df.genres for label in labels)

genre_counter


Counter({'Thriller': 6530,
         'Science Fiction': 2339,
         'Horror': 4082,
         'Adventure': 3248,
         'Supernatural': 571,
         'Action': 5868,
         'Space western': 12,
         'Erotic thriller': 170,
         'Psychological thriller': 1138,
         'Drama': 19134,
         'Family Film': 3219,
         'Fantasy': 2012,
         'World cinema': 5153,
         'Romantic comedy': 2075,
         'Ensemble Film': 349,
         'Comedy-drama': 1261,
         'Comedy': 10467,
         'Romantic drama': 2572,
         'Romance Film': 6666,
         'Costume drama': 367,
         'War film': 1556,
         'Epic': 196,
         'Period piece': 1321,
         'Film adaptation': 1225,
         'Animation': 2441,
         'Short Film': 3192,
         "Children's/Family": 759,
         'Musical': 2414,
         "Children's Fantasy": 244,
         'Indie': 3668,
         'Crime Fiction': 4275,
         'Coming of age': 741,
         'Bollywood': 1058,
         'Fan f

In [10]:
# the 35 most frequent raw genre labels, highest count first
genre_counter.most_common(35)

[('Drama', 19134),
 ('Comedy', 10467),
 ('Romance Film', 6666),
 ('Thriller', 6530),
 ('Action', 5868),
 ('World cinema', 5153),
 ('Crime Fiction', 4275),
 ('Horror', 4082),
 ('Black-and-white', 3731),
 ('Indie', 3668),
 ('Action/Adventure', 3553),
 ('Adventure', 3248),
 ('Family Film', 3219),
 ('Short Film', 3192),
 ('Romantic drama', 2572),
 ('Animation', 2441),
 ('Musical', 2414),
 ('Science Fiction', 2339),
 ('Mystery', 2119),
 ('Romantic comedy', 2075),
 ('Fantasy', 2012),
 ('Comedy film', 1778),
 ('Crime Thriller', 1682),
 ('War film', 1556),
 ('Period piece', 1321),
 ('Japanese Movies', 1290),
 ('Comedy-drama', 1261),
 ('Film adaptation', 1225),
 ('Documentary', 1213),
 ('Silent film', 1199),
 ('Psychological thriller', 1138),
 ('Bollywood', 1058),
 ('Western', 1022),
 ('Chinese Movies', 998),
 ('Black comedy', 834)]

In [11]:
#lowercase genres 
def genre_lower(genre_l):
    """Return a lower-cased copy of a list of genre names.

    A new list is built rather than overwriting ``genre_l`` element by
    element, so the caller's list (which may also be referenced from
    elsewhere) is left untouched.

    Args:
        genre_l: iterable of genre name strings.

    Returns:
        list with the same names, lower-cased, in the same order.
    """
    return [genre.lower() for genre in genre_l]

# lower-case every row's genre list so the names can be compared against
# the lower-case target genres used further down
df.genres = df.genres.apply(genre_lower)

df

Unnamed: 0,wikipedia id,movie name,genres,summary
0,975900,Ghosts of Mars,"[thriller, science fiction, horror, adventure,...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,"[thriller, erotic thriller, psychological thri...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,[drama],"Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,"[family film, fantasy, adventure, world cinema]","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"[romantic comedy, ensemble film, comedy-drama,...","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...
42199,23851782,The Ghost Train,"[crime fiction, thriller, comedy, supernatural]",{{plot}} The film opens with a Great Western e...
42200,35228177,Mermaids: The Body Found,[drama],Two former National Oceanic Atmospheric Admini...
42201,34980460,Knuckle,"[biographical film, drama, documentary]",{{No plot}} This film follows 12 years in the ...
42202,913762,The Super Dimension Fortress Macross II: Lover...,"[science fiction, japanese movies, adventure, ...","The story takes place in the year 2092,The Sup..."


In [12]:
import re

# target genres; each entry doubles as a regex pattern
# ('roman' is a stem: it is reported as 'romance' by collapse_genres)
regex_list = [
    'drama',
    'comedy',
    'action',
    'adventure',
    'roman',
    'crime',
    'thriller',
    'family',
    'world cinema',
    'horror',
    'short film',
    'science fiction',
    'mystery',
    'fantasy',
    'documentary',
    'silent film',
]

# one compiled pattern per target, in the same order as regex_list
compiled_regex = list(map(re.compile, regex_list))


# collapse genres to only include those in targets
def collapse_genres(genres, regex_list, compiled_regex):
  """Reduce a movie's genre labels to the target genres they fall under.

  A target is reported when at least one label matches its pattern. The
  patterns are applied with ``match``, i.e. anchored at the START of the
  label: 'comedy-drama' counts towards 'comedy' but not towards 'drama'.
  NOTE(review): confirm that start-anchoring (rather than ``search``) is
  the intended behaviour.

  Args:
    genres: list of (lower-cased) genre label strings for one movie.
    regex_list: target pattern strings; also used as the output names,
      except that 'roman' is reported as 'romance'.
    compiled_regex: the compiled patterns, parallel to ``regex_list``.

  Returns:
    list of target genre names, in ``regex_list`` order, without duplicates.
  """
  collapsed = []

  for position, pattern in enumerate(compiled_regex):
    matching = [label for label in genres if pattern.match(label)]
    if not matching:
      continue
    target = regex_list[position]
    collapsed.append('romance' if target == 'roman' else target)

  return collapsed
   

df.genres = df.genres.apply(lambda l: collapse_genres(l, regex_list, compiled_regex))

# Keep only movies left with at least two target genres. This also removes
# the rows whose genre list ended up empty, so no separate empty-list filter
# is needed. The original row labels are kept on purpose (no reset_index).
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df.genres.map(len) > 1].copy()

df

Unnamed: 0,wikipedia id,movie name,genres,summary
0,975900,Ghosts of Mars,"[action, adventure, thriller, horror, science ...","Set in the second half of the 22nd century, th..."
3,18998739,The Sorcerer's Apprentice,"[adventure, family, world cinema, fantasy]","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"[drama, comedy, romance]","Adam, a San Francisco-based artist who works a..."
7,11250635,The Mechanical Monsters,"[action, adventure, family, short film, scienc...",The story starts as one of the robots flies i...
8,77856,Mary Poppins,"[drama, comedy, family, fantasy]",The film opens with Mary Poppins perched in a...
...,...,...,...,...
42198,26482675,Eşrefpaşalılar,"[drama, comedy]","The film is about two friends, Tayyar , a mafi..."
42199,23851782,The Ghost Train,"[comedy, crime, thriller]",{{plot}} The film opens with a Great Western e...
42201,34980460,Knuckle,"[drama, documentary]",{{No plot}} This film follows 12 years in the ...
42202,913762,The Super Dimension Fortress Macross II: Lover...,"[drama, adventure, short film, science fiction]","The story takes place in the year 2092,The Sup..."


In [13]:
# genre counts following cleaning: number of movies per target genre
count = Counter(label for labels in df.genres for label in labels)

count.most_common()

[('drama', 14211),
 ('comedy', 9773),
 ('romance', 6678),
 ('thriller', 6166),
 ('action', 6035),
 ('world cinema', 5031),
 ('crime', 4997),
 ('family', 3554),
 ('adventure', 3096),
 ('horror', 2778),
 ('short film', 2270),
 ('science fiction', 2148),
 ('mystery', 2060),
 ('fantasy', 1996),
 ('silent film', 810),
 ('documentary', 402)]

In [14]:
# genre stats: number of target genres attached to each movie ...
df3gencs = [len(genre) for genre in df.genres]

# ... and how many movies have 2, 3, 4, ... genres (most frequent first)
df3countsCounter = Counter(df3gencs)
df3countsCounter.most_common()

[(2, 12793),
 (3, 8449),
 (4, 3512),
 (5, 1034),
 (6, 254),
 (7, 39),
 (8, 6),
 (9, 1)]





## Text Preparation



In [15]:
# Upgrade spaCy quietly, then download the small English pipeline
# (en_core_web_sm) that is loaded with spacy.load further down.
# NOTE(review): neither version is pinned; the recorded run resolved to
# 3.3.0 for both — consider pinning for reproducibility.
!pip install -Uqq spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-05-08 17:57:01.036290: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-05-08 17:57:01.036730: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [16]:
import spacy

spacy.__version__

'3.3.0'

In [17]:
# Load the small English pipeline with the 'parser' and 'ner' components
# disabled; the tokenisation helpers below only read lemmas, stop-word and
# punctuation flags.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# components that remain active
nlp.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

In [18]:
# looking at example summary 
df.iloc[10]['summary']

'The film is a collision of three separate stories—the journey of a Ju/\'hoansi Bushman to the end of the earth to get rid of a Coca-Cola bottle, the romance between a bumbling scientist and a schoolteacher, and a band of guerrillas on the run. Xi and his tribe of San/Bushmen relatives are living well off the land in the Kalahari Desert. They are happy because the gods have provided plenty of everything, and no one in the tribe has unfulfilled wants. One day, a glass Coca-Cola bottle is thrown out of an aeroplane and falls to earth unbroken. Initially, this strange artifact seems to be another boon from the gods—-Xi\'s people find many uses for it. But unlike anything that they have had before, there is only one bottle to go around. This exposes the tribe to a hitherto unknown phenomenon, property, and they soon find themselves experiencing things they never had before: jealousy, envy, anger, hatred, even violence. Since it has caused the tribe unhappiness on two occasions, Xi decides 

In [19]:
# collect unigrams
# case fold, stop word removal, punctuation removal  
def remove_token(token):
  """Tell whether a token should be dropped from the unigram list.

  A token is dropped when its text is only whitespace, or when it is
  flagged as a stop word (``is_stop``) or as punctuation (``is_punct``).
  """
  if token.text.isspace():
    return True
  return token.is_stop or token.is_punct


def tokenize(summ):
  """Turn one text into a list of lower-cased lemmas.

  The text is run through the ``nlp`` pipeline; tokens rejected by
  ``remove_token`` are skipped, and every remaining token contributes its
  lemma in lower case.
  """
  kept = []
  for token in nlp(summ):
    if remove_token(token):
      continue
    kept.append(token.lemma_.lower())
  return kept


def tokenize_sentence(sentences):
  """Tokenise a list of sentences, keeping the sentence boundaries.

  Each sentence goes through ``tokenize`` (lower-cased lemmas with
  whitespace, stop words and punctuation removed), so sentence-level and
  summary-level tokenisation share one definition of the cleaning rules.

  Args:
    sentences: iterable of sentence strings.

  Returns:
    list with one list of tokens per input sentence, in the same order.
  """
  return [tokenize(sent) for sent in sentences]

In [20]:
import nltk 
# fetch the Punkt sentence-tokenizer data (presumably what
# nltk.sent_tokenize, used further down, relies on)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [22]:
import multiprocessing as mp

mp.cpu_count()

12

In [23]:
import multiprocessing as mp

# These steps run serially in the kernel process: Series.apply never hands
# work to a multiprocessing pool, so opening one around them would only
# spawn idle worker processes.
sentences = df.summary.apply(nltk.sent_tokenize)

# assign() returns a new frame, which avoids writing columns into a frame
# that was itself obtained by filtering (SettingWithCopyWarning).
df = df.assign(
    unigrams=df.summary.apply(tokenize),            # lemmas of the whole summary
    sentences=sentences,                            # raw sentences
    lem_sent=sentences.apply(tokenize_sentence),    # lemmas per sentence
)

df

Unnamed: 0,wikipedia id,movie name,genres,summary,unigrams,sentences,lem_sent
0,975900,Ghosts of Mars,"[action, adventure, thriller, horror, science ...","Set in the second half of the 22nd century, th...","[set, second, half, 22nd, century, film, depic...","[Set in the second half of the 22nd century, t...","[[set, second, half, 22nd, century, film, depi..."
3,18998739,The Sorcerer's Apprentice,"[adventure, family, world cinema, fantasy]","Every hundred years, the evil Morgana returns...","[year, evil, morgana, return, claim, fingall, ...","[Every hundred years, the evil Morgana return...","[[year, evil, morgana, return, claim, fingall,..."
4,6631279,Little city,"[drama, comedy, romance]","Adam, a San Francisco-based artist who works a...","[adam, san, francisco, base, artist, work, cab...","[Adam, a San Francisco-based artist who works ...","[[adam, san, francisco, base, artist, work, ca..."
7,11250635,The Mechanical Monsters,"[action, adventure, family, short film, scienc...",The story starts as one of the robots flies i...,"[story, start, robot, fly, scientist, secret, ...",[ The story starts as one of the robots flies ...,"[[story, start, robot, fly, scientist, secret,..."
8,77856,Mary Poppins,"[drama, comedy, family, fantasy]",The film opens with Mary Poppins perched in a...,"[film, open, mary, poppins, perch, cloud, high...",[The film opens with Mary Poppins perched in ...,"[[film, open, mary, poppins, perch, cloud, hig..."
...,...,...,...,...,...,...,...
42198,26482675,Eşrefpaşalılar,"[drama, comedy]","The film is about two friends, Tayyar , a mafi...","[film, friend, tayyar, mafia, leader, davut, n...","[The film is about two friends, Tayyar , a maf...","[[film, friend, tayyar, mafia, leader, davut, ..."
42199,23851782,The Ghost Train,"[comedy, crime, thriller]",{{plot}} The film opens with a Great Western e...,"[plot, film, open, great, western, express, sp...",[{{plot}} The film opens with a Great Western ...,"[[plot, film, open, great, western, express, s..."
42201,34980460,Knuckle,"[drama, documentary]",{{No plot}} This film follows 12 years in the ...,"[plot, film, follow, 12, year, life, 3, irish,...",[{{No plot}} This film follows 12 years in the...,"[[plot, film, follow, 12, year, life, 3, irish..."
42202,913762,The Super Dimension Fortress Macross II: Lover...,"[drama, adventure, short film, science fiction]","The story takes place in the year 2092,The Sup...","[story, take, place, year, 2092,the, super, di...","[The story takes place in the year 2092,The Su...","[[story, take, place, year, 2092,the, super, d..."


In [24]:
# example tokenization: the lemma list stored for the row labelled 42199
df.loc[42199, 'unigrams']

['plot',
 'film',
 'open',
 'great',
 'western',
 'express',
 'speeding',
 'box',
 'tunnel',
 'route',
 'cornwall',
 'train',
 'passenger',
 'include',
 'herbert',
 'edna',
 'young',
 'couple',
 'travel',
 'truro',
 'marry',
 'miss',
 'bourne',
 'west',
 'london',
 'spinster',
 'visit',
 'evacuated',
 'relative',
 'tommy',
 'gander',
 'overenthusiastic',
 'vaudeville',
 'style',
 'comedian',
 'entertainer',
 'head',
 'pier',
 'pavilion',
 'newquay',
 'dr',
 'sterling',
 'locum',
 'doctor',
 'redruth',
 'richard',
 'g',
 'winthrop',
 'cousin',
 'jackie',
 'head',
 'truro',
 'teddy',
 'deakin',
 'pass',
 'teignmouth',
 'communication',
 'cord',
 'pull',
 'train',
 'stop',
 'guard',
 'passenger',
 'find',
 'gander',
 'lose',
 'hat',
 'run',
 'retrieve',
 'ignorant',
 'delay',
 'cause',
 'return',
 'train',
 'come',
 'jackie',
 'winthrop',
 'compartment',
 'gander',
 'try',
 'conversation',
 'teddy',
 'eye',
 'jackie',
 'quick',
 'try',
 'send',
 'packing',
 'point',
 'insinuate',
 'gander

In [35]:
# mean number of tokens in summary 
df.unigrams.str.len().mean()

# keep obs with at least 75 tkens
df = df[df.unigrams.map(len) >= 75]


In [36]:
# Bind both names that the following statements read (`df_new` and
# `df_new2`); with only `df_new2` defined, a fresh kernel stops with a
# NameError on `df_new`. A copy is used so that adding the label column
# does not modify `df` itself.
df_new = df.copy()
df_new2 = df_new

def get_drama_comedy_action_thriller(genres):
  """Pick the single label used for classification from a genre list.

  The list is scanned in order and the first entry that is one of
  'drama', 'comedy', 'action' or 'thriller' is returned. Every entry is
  examined: a non-target genre at the front of the list no longer ends the
  search.

  Args:
    genres: list of genre name strings for one movie.

  Returns:
    the first target genre found, or the sentinel 0 when the list contains
    none of them (including when it is empty).
  """
  targets = ("drama", "comedy", "action", "thriller")
  for gen in genres:
    if gen in targets:
      return gen
  # only reached after the whole list has been checked
  return 0

# attach the classification label to every movie; rows labelled 0 are the
# ones excluded later by the `!= 0` filter
df_new["genre_new"] = df_new.genres.apply(get_drama_comedy_action_thriller)

In [43]:
TARGET_GENRES = ["drama", "comedy", "action", "thriller"]
SAMPLE_SIZE = 500   # movies drawn per genre
SAMPLE_SEED = 42    # fixed so the same subset is drawn on every run

# drop the movies that received no target label (genre_new == 0); done
# first so that the per-genre frames below are built from the filtered frame
df_new2 = df_new[df_new["genre_new"] != 0]

# one frame per target genre, and a fixed-size random sample of each
# (sample() raises ValueError if a genre has fewer than SAMPLE_SIZE rows)
genre_frames = {
    genre: df_new2[df_new2["genre_new"] == genre] for genre in TARGET_GENRES
}
genre_subsets = {
    genre: frame.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
    for genre, frame in genre_frames.items()
}

# keep the individual per-genre names bound for any code that reads them
df_drama, df_comedy, df_action, df_thriller = (
    genre_frames[genre] for genre in TARGET_GENRES
)
df_drama_subset, df_comedy_subset, df_action_subset, df_thriller_subset = (
    genre_subsets[genre] for genre in TARGET_GENRES
)

# balanced dataset: SAMPLE_SIZE rows per genre, in TARGET_GENRES order
df_concat = pd.concat([genre_subsets[genre] for genre in TARGET_GENRES])


In [44]:
def flatten_sent(sentences):
  """Join each tokenised sentence back into one space-separated string.

  Takes a list of token lists and returns a list of strings of the same
  length and in the same order.
  """
  joined = []
  for tokens in sentences:
    joined.append(" ".join(tokens))
  return joined

# one string of space-joined lemmas per sentence, for every sampled movie
df_concat["lem_sent_join"] = df_concat.lem_sent.apply(flatten_sent)

df_concat

Unnamed: 0,wikipedia id,movie name,genres,summary,unigrams,sentences,lem_sent,genre_new,lem_sent_join
36084,431014,Big Fish,"[drama, comedy, adventure, family, fantasy]","At his son's wedding party, Edward Bloom tell...","[son, wedding, party, edward, bloom, tell, tal...","[At his son's wedding party, Edward Bloom tel...","[[son, wedding, party, edward, bloom, tell, ta...",drama,[son wedding party edward bloom tell tale tell...
16224,18057739,(500) Days of Summer,"[drama, comedy, romance]",The film is presented in a nonlinear narrative...,"[film, present, nonlinear, narrative, jump, da...",[The film is presented in a nonlinear narrativ...,"[[film, present, nonlinear, narrative, jump, d...",drama,[film present nonlinear narrative jump day 500...
24779,20475363,The Tempest,"[drama, comedy, fantasy]","Prospera, the duchess of Milan, is usurped by ...","[prospera, duchess, milan, usurp, brother, ant...","[Prospera, the duchess of Milan, is usurped by...","[[prospera, duchess, milan, usurp, brother, an...",drama,[prospera duchess milan usurp brother antonio ...
28017,17941264,Poison Ivy 4: The Secret Society,"[drama, thriller, fantasy]","Danielle ""Daisy"" Brookes is an orphan who leav...","[danielle, daisy, brookes, orphan, leave, ranc...","[Danielle ""Daisy"" Brookes is an orphan who lea...","[[danielle, daisy, brookes, orphan, leave, ran...",drama,[danielle daisy brookes orphan leave ranch boy...
5778,32930644,Blood Money,"[drama, action, thriller]",The story is about Kunal who believes he can ...,"[story, kunal, believe, try, hard, receive, jo...",[The story is about Kunal who believes he can...,"[[story, kunal, believe, try, hard], [receive,...",drama,"[story kunal believe try hard, receive job off..."
...,...,...,...,...,...,...,...,...,...
29042,3210534,Saw,"[thriller, horror, short film]","A young man, David , is in an interrogation ro...","[young, man, david, interrogation, room, talk,...","[A young man, David , is in an interrogation r...","[[young, man, david, interrogation, room, talk...",thriller,[young man david interrogation room talk unnam...
27086,6811111,The Scarecrow,"[thriller, horror, mystery]",One night a girl is slain in the woods of a sm...,"[night, girl, slay, wood, small, town, teenage...",[One night a girl is slain in the woods of a s...,"[[night, girl, slay, wood, small, town, teenag...",thriller,[night girl slay wood small town teenager sam ...
35902,11527681,ELI,"[thriller, short film, science fiction]","A man wakes up in a hospital, lying on an oper...","[man, wake, hospital, lie, operating, table, i...","[A man wakes up in a hospital, lying on an ope...","[[man, wake, hospital, lie, operating, table],...",thriller,"[man wake hospital lie operating table, idea r..."
16790,4306261,The Eye 10,"[thriller, world cinema, horror]","In Thailand, Chong-kwai welcomes his friends T...","[thailand, chong, kwai, welcome, friend, ted, ...","[In Thailand, Chong-kwai welcomes his friends ...","[[thailand, chong, kwai, welcome, friend, ted,...",thriller,[thailand chong kwai welcome friend ted kofei ...


In [45]:
# prepare df for exporting


# discard the columns that are not exported and renumber the rows from 0
df_concat = df_concat.drop(columns=['wikipedia id', 'movie name', 'genres'])
df_concat = df_concat.reset_index(drop=True)


# put the remaining columns in export order (text first, label last)
df_concat = df_concat.loc[
    :, ['summary', 'unigrams', 'sentences', 'lem_sent', 'lem_sent_join', 'genre_new']
]

df_concat

Unnamed: 0,summary,unigrams,sentences,lem_sent,lem_sent_join,genre_new
0,"At his son's wedding party, Edward Bloom tell...","[son, wedding, party, edward, bloom, tell, tal...","[At his son's wedding party, Edward Bloom tel...","[[son, wedding, party, edward, bloom, tell, ta...",[son wedding party edward bloom tell tale tell...,drama
1,The film is presented in a nonlinear narrative...,"[film, present, nonlinear, narrative, jump, da...",[The film is presented in a nonlinear narrativ...,"[[film, present, nonlinear, narrative, jump, d...",[film present nonlinear narrative jump day 500...,drama
2,"Prospera, the duchess of Milan, is usurped by ...","[prospera, duchess, milan, usurp, brother, ant...","[Prospera, the duchess of Milan, is usurped by...","[[prospera, duchess, milan, usurp, brother, an...",[prospera duchess milan usurp brother antonio ...,drama
3,"Danielle ""Daisy"" Brookes is an orphan who leav...","[danielle, daisy, brookes, orphan, leave, ranc...","[Danielle ""Daisy"" Brookes is an orphan who lea...","[[danielle, daisy, brookes, orphan, leave, ran...",[danielle daisy brookes orphan leave ranch boy...,drama
4,The story is about Kunal who believes he can ...,"[story, kunal, believe, try, hard, receive, jo...",[The story is about Kunal who believes he can...,"[[story, kunal, believe, try, hard], [receive,...","[story kunal believe try hard, receive job off...",drama
...,...,...,...,...,...,...
1995,"A young man, David , is in an interrogation ro...","[young, man, david, interrogation, room, talk,...","[A young man, David , is in an interrogation r...","[[young, man, david, interrogation, room, talk...",[young man david interrogation room talk unnam...,thriller
1996,One night a girl is slain in the woods of a sm...,"[night, girl, slay, wood, small, town, teenage...",[One night a girl is slain in the woods of a s...,"[[night, girl, slay, wood, small, town, teenag...",[night girl slay wood small town teenager sam ...,thriller
1997,"A man wakes up in a hospital, lying on an oper...","[man, wake, hospital, lie, operating, table, i...","[A man wakes up in a hospital, lying on an ope...","[[man, wake, hospital, lie, operating, table],...","[man wake hospital lie operating table, idea r...",thriller
1998,"In Thailand, Chong-kwai welcomes his friends T...","[thailand, chong, kwai, welcome, friend, ted, ...","[In Thailand, Chong-kwai welcomes his friends ...","[[thailand, chong, kwai, welcome, friend, ted,...",[thailand chong kwai welcome friend ted kofei ...,thriller


In [47]:
# export the prepared frame to a CSV file in the current working directory
# NOTE(review): with the default arguments the row index is presumably
# written as an extra unnamed first column, and the list-valued columns are
# stored as text — confirm that downstream readers expect this.
df_concat.to_csv('./processed_dataframe.csv')

In [48]:
pwd

'C:\\Users\\PC\\Downloads'