In [1]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 4.3 MB/s 
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[K     |████████████████████████████████| 457 kB 47.8 MB/s 
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 44.6 MB/s 
Collecting typing-extensions<4.2.0,>=3.7.4
  Downloading typing_extensions-4.1.1-py3-none-any.whl (26 kB)
Collecting thinc<

In [66]:
import spacy
import re
# Load the `en_core_web_sm` spaCy pipeline (downloaded in the setup cell);
# `nlp` is called on the raw text below to produce the Doc that gets cleaned.
nlp = spacy.load('en_core_web_sm')

In [67]:
# Raw input document for the bag-of-words demo: a multi-paragraph film plot
# summary. Continuation lines keep their leading indentation inside the string.
text = """The story starts with a flashback in which two school boys, Vinod Nair and his friend, are at the Thalassery pier. Vinod Nair sees a Muslim girl who he finds attractive and he prays to God to make her his wife in the future.
          Then movie shifts focus to the present, in which Vinod, hailing from a middle-class Hindu Nair family, is jailed for trespassing the property of a rich Muslim politician, Abdul Khader. Realizing that he went there to meet the Muslim girl, 
          Aisha, the politician's niece with whom he's madly in love, Sub-inspector Prem Kumar and his men request him to tell his love story.
          From here, the movie goes back in time by a few months, when Vinod first saw Aisha while attending his friend's marriage. He accidentally collides with her while running along a corridor and she falls down a staircase, 
          ending up unconscious in a hospital. Vinod goes to the hospital and finds a little girl beside Aisha's bed. With her help, he leaves a 'sorry' letter to Aisha, written using a stencil. Then with his close friends Abdu and Mustafa, Vinod tries to win Aisha's heart.
          To get a chance to talk to Aisha, Vinod next tries to participate in a Daffmuttu competition in Kannur University Cultural Festival in which Aisha also is participating. He goes to a trainer Najaf. But Vinod and Najaf have a history of bitterness when during his 12th grade, 
          the duo had a tiff. The storyline goes into further flashback mode, to a few years prior.Vinod along with his friends back then, Nijad and Majid had a gang, named Smartboyz. Majid was in love with Fatima, the daughter of the local shrimp merchant. But Fatima was in love with Najaf. 
          Thus the trio decides to take on Najaf and beat him by covering faces. After beating Najaf, in an act of heroism, Majid accidentally threw away his driving license along with some money. The driving license gives his credentials away, and the next day all three are beaten up by Najaf's friends."""

In [68]:
# Run the loaded spaCy pipeline over the raw text; the resulting `doc` is the
# token sequence that clean() iterates over.
doc = nlp(text)

In [69]:
def clean(doc):
  """Reduce a tokenised document to a lowercase string of lemmas.

  A token is discarded when it looks like a number or a currency symbol, is
  punctuation / whitespace / a bracket / a quote, is a stop word, or has
  empty text after stripping. The lemmas of the surviving tokens are joined
  with single spaces; every character that is not an ASCII letter or an
  apostrophe is then replaced by a space, and the result is lower-cased.

  Args:
    doc: Iterable of spaCy-style tokens exposing ``like_num``,
      ``is_currency``, ``is_punct``, ``is_space``, ``is_bracket``,
      ``is_quote``, ``is_stop``, ``text`` and ``lemma_``.

  Returns:
    str: The cleaned text. Runs of spaces left behind by replaced characters
    are not collapsed. An empty ``doc`` yields ``""``.
  """
  def keep(token):
    # Single-pass equivalent of the successive filters: numbers/currency,
    # punctuation-like tokens, stop words, and blank text.
    if token.like_num or token.is_currency:
      return False
    if token.is_punct or token.is_space or token.is_bracket or token.is_quote:
      return False
    if token.is_stop:
      return False
    return token.text.strip() != ""

  lemmas = [token.lemma_ for token in doc if keep(token)]
  # Replacing everything outside [a-zA-Z'] with a space also removes all
  # non-ASCII characters, so a separate non-ASCII stripping pass is not needed.
  filtered = re.sub(r"[^a-zA-Z\']", " ", " ".join(lemmas))
  return filtered.lower()

In [70]:
# CountVectorizer expects a list of strings (one per document), so the single
# cleaned string is wrapped in a one-element list.
cleaned = [clean(doc)]

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer with default settings and fit it on the cleaned
# corpus in one step; X is the transformed result for that corpus, used by
# the cells below via X.toarray().
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned)

In [72]:
# List the learned vocabulary, i.e. every separate word, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the output a
# plain Python list of strings.
vectorizer.get_feature_names_out().tolist()



['abdu',
 'abdul',
 'accidentally',
 'act',
 'aisha',
 'attend',
 'attractive',
 'away',
 'beat',
 'bed',
 'bitterness',
 'boy',
 'chance',
 'class',
 'close',
 'collide',
 'competition',
 'corridor',
 'cover',
 'credential',
 'cultural',
 'daffmuttu',
 'daughter',
 'day',
 'decide',
 'drive',
 'duo',
 'end',
 'face',
 'fall',
 'family',
 'fatima',
 'festival',
 'find',
 'flashback',
 'focus',
 'friend',
 'future',
 'gang',
 'girl',
 'give',
 'go',
 'god',
 'grade',
 'hail',
 'heart',
 'help',
 'heroism',
 'hindu',
 'history',
 'hospital',
 'inspector',
 'jail',
 'kannur',
 'khader',
 'kumar',
 'leave',
 'letter',
 'license',
 'little',
 'local',
 'love',
 'madly',
 'majid',
 'man',
 'marriage',
 'meet',
 'merchant',
 'middle',
 'mode',
 'money',
 'month',
 'movie',
 'muslim',
 'mustafa',
 'nair',
 'najaf',
 'name',
 'niece',
 'nijad',
 'participate',
 'pier',
 'politician',
 'pray',
 'prem',
 'present',
 'prior',
 'property',
 'realize',
 'request',
 'rich',
 'run',
 'school',
 'see',

In [73]:
print(X.toarray()) # each number is the count of the corresponding vocabulary word in the document


[[1 1 2 1 7 1 1 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 2 1
  5 1 1 3 1 5 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 4 1 3 1 1 1 1 1 1 1 1
  2 3 1 3 6 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1
  1 1 1 1 1 2 1 1 9 1 1 1 1]]


In [74]:
import pandas as pd

# Show every column when a frame is displayed instead of truncating wide ones.
pd.set_option("display.max_columns", None)

# Put the count matrix in a DataFrame whose columns are labelled with the
# vocabulary terms, so the counts can be read against the words they belong
# to. This is also the tabular form in which the text is passed on for model
# building.
df_bow = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [75]:
# Preview the bag-of-words table: one column per vocabulary word, with the
# word counts as values.
df_bow.head()

Unnamed: 0,abdu,abdul,accidentally,act,aisha,attend,attractive,away,beat,bed,bitterness,boy,chance,class,close,collide,competition,corridor,cover,credential,cultural,daffmuttu,daughter,day,decide,drive,duo,end,face,fall,family,fatima,festival,find,flashback,focus,friend,future,gang,girl,give,go,god,grade,hail,heart,help,heroism,hindu,history,hospital,inspector,jail,kannur,khader,kumar,leave,letter,license,little,local,love,madly,majid,man,marriage,meet,merchant,middle,mode,money,month,movie,muslim,mustafa,nair,najaf,name,niece,nijad,participate,pier,politician,pray,prem,present,prior,property,realize,request,rich,run,school,see,shift,shrimp,smartboyz,sorry,staircase,start,stencil,story,storyline,sub,talk,tell,thalassery,throw,tiff,time,trainer,trespass,trio,try,unconscious,university,vinod,wife,win,write,year
0,1,1,2,1,7,1,1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,2,1,2,2,1,5,1,1,3,1,5,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,4,1,3,1,1,1,1,1,1,1,1,2,3,1,3,6,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,2,1,1,9,1,1,1,1


In [76]:
# Toy corpus of five short documents used to demonstrate `min_df`.
a = [
    'spiderman great fly',
    'superman great fly',
    'batman super fight',
    'flash fast run',
    'ironman super brain',
]

In [77]:
# min_df=2: ignore any word that appears in fewer than 2 documents.
vectorizer1 = CountVectorizer(min_df=2)
X1 = vectorizer1.fit_transform(a)

import pandas as pd

# Show every column when the frame is displayed.
pd.set_option("display.max_columns", None)

# Label the count matrix with the surviving vocabulary terms so it can be
# inspected as a table.
df_bow1 = pd.DataFrame(
    X1.toarray(),
    columns=vectorizer1.get_feature_names_out(),
)

In [78]:
df_bow1.head() # only 'fly', 'great' and 'super' are kept: they are the only words with document frequency (df) >= 2

Unnamed: 0,fly,great,super
0,1,1,0
1,1,1,0
2,0,0,1
3,0,0,0
4,0,0,1


In [79]:
# Toy corpus of seven short documents used to demonstrate `max_df`.
b = [
    'captain america super handsome',
    'hulk great strength',
    'spiderman great fly',
    'superman great fly',
    'batman super fight',
    'flash fast run',
    'ironman super brain',
]

In [80]:
# max_df=2: ignore any word that appears in more than 2 documents.
vectorizer2 = CountVectorizer(max_df=2)
X2 = vectorizer2.fit_transform(b)

import pandas as pd

# Show every column when the frame is displayed.
pd.set_option("display.max_columns", None)

# Label the count matrix with the surviving vocabulary terms so it can be
# inspected as a table.
df_bow2 = pd.DataFrame(
    X2.toarray(),
    columns=vectorizer2.get_feature_names_out(),
)

In [81]:
df_bow2.head() # 'super' and 'great' are dropped because each appears in more than 2 documents (df > 2)

Unnamed: 0,america,batman,brain,captain,fast,fight,flash,fly,handsome,hulk,ironman,run,spiderman,strength,superman
0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
