In [353]:
import pandas as pd
import nltk
import re

In [354]:
#StopWords
from nltk.corpus import stopwords

In [355]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Moderna Articles

### Reading the data

In [356]:
moderna_df = pd.read_csv('Moderna Articles.csv')

In [357]:
#Providing Label
moderna_df['Label'] = 'Moderna'

In [358]:
moderna_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna
1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna
2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna
3,Moderna Inc on Thursday received approval from...,TOI,Moderna
4,???,TOI,Moderna


### Dropping records that contain ???

In [359]:
moderna_df = moderna_df[moderna_df['DOC'] != '???']

In [360]:
moderna_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna
1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna
2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna
3,Moderna Inc on Thursday received approval from...,TOI,Moderna
5,WASHINGTON: US manufacturer Moderna on Tuesday...,TOI,Moderna


### Dropping records that contain '###'

In [361]:
moderna_df = moderna_df[moderna_df['DOC'] != '###']

In [362]:
moderna_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna
1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna
2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna
3,Moderna Inc on Thursday received approval from...,TOI,Moderna
5,WASHINGTON: US manufacturer Moderna on Tuesday...,TOI,Moderna


### Dropping Empty Records

In [363]:
moderna_df.isnull().sum()

DOC          23
NEWSPAPER     5
Label         0
dtype: int64

In [364]:
moderna_df.dropna(inplace=True)

### Dropping Duplicates

In [365]:
#Dropping duplicates based on DOC column
moderna_df.drop_duplicates(subset=['DOC'],inplace=True)

### Resetting Index of Dataframe

In [366]:
moderna_df.reset_index(inplace=True)

In [367]:
moderna_df.head()

Unnamed: 0,index,DOC,NEWSPAPER,Label
0,0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna
1,1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna
2,2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna
3,3,Moderna Inc on Thursday received approval from...,TOI,Moderna
4,5,WASHINGTON: US manufacturer Moderna on Tuesday...,TOI,Moderna


In [368]:
#Later dropping the index column that got created after reset_index
moderna_df.drop(['index'],axis=1,inplace=True)

In [369]:
moderna_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna
1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna
2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna
3,Moderna Inc on Thursday received approval from...,TOI,Moderna
4,WASHINGTON: US manufacturer Moderna on Tuesday...,TOI,Moderna


In [370]:
#Shape of dataset
moderna_df.shape

(833, 3)

## Cleaning Article

In [371]:
def clean_text(articles):
    '''
    Clean a collection of articles and return the cleaned texts.

    For every article, each character that is not an ASCII letter (digits and
    punctuation included) is replaced by a space. The text is then lower-cased,
    split into words, filtered against NLTK's English stop-word list and
    lemmatized with the notebook-level `lemmatizer`.

    Parameters
    ----------
    articles : iterable of str
        Raw article texts, e.g. a pandas Series or a list. Items are read in
        iteration order, so a Series does not need a 0..n-1 index.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input article, in input order.
    '''
    #Building the stop-word set once; rebuilding it for every single word is very slow
    english_stopwords = set(stopwords.words('english'))

    cleaned_articles = []
    for article in articles:
        #Replacing every character except letters (alphabets) with a space
        words = re.sub('[^a-zA-Z]',' ',article)
        #Lowering the text
        words = words.lower()
        #Splitting into words
        words = words.split()
        #Removing stop words and lemmatizing the remaining words
        words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
        #After lemmatization, again joining the words
        words = ' '.join(words)

        cleaned_articles.append(words)
    return cleaned_articles

### Getting the cleaned articles

In [372]:
articles = moderna_df['DOC']

clean_articles = clean_text(articles)
#Previewing only the first few cleaned articles instead of dumping the whole list
clean_articles[:5]

['detroit los angeles distribution moderna inc covid vaccine location united state begun vastly widening rollout started last week pfizer inc u army general gustave perna said saturday moderna already moved vaccine manufacturing plant warehouse operated distributor mckesson corp packed container loaded truck saturday perna said news conference truck set sunday shipment start reaching healthcare provider soon monday said food drug administration friday approved emergency use authorization moderna vaccine second covid vaccine approved jab developed pfizer german partner biontech se approved dec worker pharmaceutical service provider catalent inc facility bloomington indiana filling packaging vial moderna vaccine handing mckesson ship dos facility including louisville kentucky memphis tennessee location close air hub united parcel service inc fedex corp start delivery moderna vaccine significantly widen availability covid vaccine u death related respiratory virus set record footrace vacci

In [373]:
len(clean_articles)

833

### Joining in the dataframe as "clean_article"

In [374]:
moderna_df['Clean_Article'] = clean_articles

In [375]:
moderna_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label,Clean_Article
0,DETROIT/LOS ANGELES: Distribution of Moderna I...,TOI,Moderna,detroit los angeles distribution moderna inc c...
1,Moderna Inc said on Monday it was informed by ...,TOI,Moderna,moderna inc said monday informed european medi...
2,NEW YORK: Moderna has begun testing its COVID-...,TOI,Moderna,new york moderna begun testing covid vaccine c...
3,Moderna Inc on Thursday received approval from...,TOI,Moderna,moderna inc thursday received approval u food ...
4,WASHINGTON: US manufacturer Moderna on Tuesday...,TOI,Moderna,washington u manufacturer moderna tuesday said...


In [376]:
moderna_df.to_csv('Moderna Clean Articles.csv',index=False)

# Pfizer Articles

### Reading the data

In [377]:
pfizer_df = pd.read_csv('Pfizer Articles.csv')

In [378]:
pfizer_df['Label'] = 'Pfizer'

In [379]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


### Dropping records that contain ???

In [380]:
pfizer_df = pfizer_df[pfizer_df['DOC'] != '???']

In [381]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


### Dropping records that contain '###'

In [382]:
pfizer_df = pfizer_df[pfizer_df['DOC'] != '###']

In [383]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


### Dropping records that contain '>'

In [384]:
pfizer_df = pfizer_df[pfizer_df['DOC'] != '>']

In [385]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


### Dropping Duplicates

In [386]:
#Dropping duplicates based on DOC column
pfizer_df.drop_duplicates(subset=['DOC'],inplace=True)

### Dropping Empty Records

In [387]:
pfizer_df.isnull().sum()

DOC          1
NEWSPAPER    0
Label        0
dtype: int64

In [388]:
pfizer_df.dropna(inplace=True)

### Resetting Index of Dataframe

In [389]:
pfizer_df.reset_index(inplace=True)

In [390]:
pfizer_df.head()

Unnamed: 0,index,DOC,NEWSPAPER,Label
0,0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


In [391]:
#Later dropping the index column that got created after reset_index
pfizer_df.drop(['index'],axis=1,inplace=True)

In [392]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer


In [393]:
#Shape of dataset
pfizer_df.shape

(957, 3)

### Getting the cleaned articles

In [394]:
articles = pfizer_df['DOC']

clean_articles = clean_text(articles)
#Previewing only the first few cleaned articles instead of dumping the whole list
clean_articles[:5]

['pfizer announced wednesday covid vaccine safe strongly protective kid young step toward possibly beginning shot age group head back school fall covid vaccine rolled worldwide adult higher risk coronavirus pfizer vaccine authorized age older vaccinating child age critical stopping pandemic helping school least upper grade start look little normal month disruption study u volunteer age preliminary data showed case covid among fully vaccinated adolescent compared among given dummy shot pfizer reported small study yet published another important piece evidence well shot revved kid immune system researcher reported high level virus fighting antibody somewhat higher seen study young adult kid side effect similar young adult company said main side effect pain fever chill fatigue particularly second dose study continue track participant two year information long term protection safety pfizer german partner biontech coming week plan ask u food drug administration european regulator allow emer

In [395]:
len(clean_articles)

957

### Joining in the dataframe as "clean_article"

In [396]:
pfizer_df['Clean_Article'] = clean_articles

In [397]:
pfizer_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label,Clean_Article
0,Pfizer announced Wednesday that its Covid-19 v...,TOI,Pfizer,pfizer announced wednesday covid vaccine safe ...
1,Pfizer Inc and partner BioNTech SE said on Fri...,TOI,Pfizer,pfizer inc partner biontech se said friday sub...
2,NEW DELHI/MUMBAI: The Indian drug regulator's ...,TOI,Pfizer,new delhi mumbai indian drug regulator subject...
3,SEOUL: North Korean hackers tried to break int...,TOI,Pfizer,seoul north korean hacker tried break computer...
4,NEW DELHI: In a historic moment in the fight a...,TOI,Pfizer,new delhi historic moment fight pandemic brita...


In [398]:
pfizer_df.isnull().sum()

DOC              0
NEWSPAPER        0
Label            0
Clean_Article    0
dtype: int64

In [399]:
pfizer_df.to_csv('Pfizer Clean Articles.csv',index=False)

# Covishield Articles

### Reading the data

In [400]:
covishield_df = pd.read_excel('Covishield Articles.xlsx')

In [401]:
covishield_df['NEWSPAPER'] = 'TOI'
covishield_df['Label'] = 'Covishield'

In [402]:
covishield_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield
1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield
2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield
3,MUMBAI: India may have decided to stick to the...,TOI,Covishield
4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield


In [403]:
covishield_df.shape

(400, 3)

### Dropping records that contain ???

In [404]:
covishield_df = covishield_df[covishield_df['DOC'] != '???']

In [405]:
covishield_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield
1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield
2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield
3,MUMBAI: India may have decided to stick to the...,TOI,Covishield
4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield


### Dropping records that contain '###'

In [406]:
covishield_df = covishield_df[covishield_df['DOC'] != '###']

In [407]:
covishield_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield
1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield
2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield
3,MUMBAI: India may have decided to stick to the...,TOI,Covishield
4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield


### Dropping Duplicates

In [408]:
#Dropping duplicates based on DOC column
#(this cell previously de-duplicated pfizer_df, leaving covishield_df untouched)
covishield_df.drop_duplicates(subset=['DOC'],inplace=True)

### Dropping Empty Records

In [409]:
covishield_df.isnull().sum()

DOC          0
NEWSPAPER    0
Label        0
dtype: int64

### Resetting Index of Dataframe

In [410]:
covishield_df.reset_index(inplace=True)

In [411]:
covishield_df.head()

Unnamed: 0,index,DOC,NEWSPAPER,Label
0,0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield
1,1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield
2,2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield
3,3,MUMBAI: India may have decided to stick to the...,TOI,Covishield
4,4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield


In [412]:
#Later dropping the index column that got created after reset_index
covishield_df.drop(['index'],axis=1,inplace=True)

In [413]:
covishield_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield
1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield
2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield
3,MUMBAI: India may have decided to stick to the...,TOI,Covishield
4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield


In [414]:
#Shape of dataset
covishield_df.shape

(384, 3)

### Getting the cleaned articles

In [415]:
articles = covishield_df['DOC']

clean_articles = clean_text(articles)
#Previewing only the first few cleaned articles instead of dumping the whole list
clean_articles[:5]

['chennai tamil nadu government doctor association tngda advised member opt covishield vaccine manufactured serum institute india vector based vaccine use nearly two month without major side effect reported vaccine approved emergency use bharat biotech covaxin last week government doctor association said would put advisory doctor based opinion three independent specialist safety efficacy choice vaccine offered thursday night circular association urged doctor except contraindication take vaccine doctor healthcare provider high risk contracting covid daily routine covisheild completed phase human trial published result hand covaxin yet complete phase trial come result covishield shown good immunogenicity said although union health ministry said beneficiary option choose two vaccine association asked doctor insist covisheild public health official said vaccination done covisheild state redistributed vaccine district vaccination programme scheduled begin january state received lakh dos vac

In [416]:
len(clean_articles)

384

### Joining in the dataframe as "clean_article"

In [417]:
covishield_df['Clean_Article'] = clean_articles

In [418]:
covishield_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label,Clean_Article
0,CHENNAI: The Tamil Nadu government doctors ass...,TOI,Covishield,chennai tamil nadu government doctor associati...
1,MANGALURU: Dakshina Kannada district is fully ...,TOI,Covishield,mangaluru dakshina kannada district fully prep...
2,CHANDIGARH: The wait is finally over. Punjab o...,TOI,Covishield,chandigarh wait finally punjab tuesday receive...
3,MUMBAI: India may have decided to stick to the...,TOI,Covishield,mumbai india may decided stick day gap two sho...
4,PUNE: Ahead of the vaccination drive against c...,TOI,Covishield,pune ahead vaccination drive coronavirus total...


In [419]:
covishield_df.isnull().sum()

DOC              0
NEWSPAPER        0
Label            0
Clean_Article    0
dtype: int64

In [420]:
covishield_df.to_csv('Covishield Clean Articles.csv',index=False)

# Covaxin Articles

### Reading the data

In [421]:
covaxin_df = pd.read_excel('Covaxin Articles.xlsx')

In [422]:
covaxin_df['NEWSPAPER'] = 'TOI'
covaxin_df['Label'] = 'Covaxin'

In [423]:
covaxin_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin
1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin
2,NEW DELHI: The government said regular emergen...,TOI,Covaxin
3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin
4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin


In [424]:
covaxin_df.shape

(920, 3)

### Dropping records that contain ???

In [425]:
covaxin_df = covaxin_df[covaxin_df['DOC'] != '???']

In [426]:
covaxin_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin
1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin
2,NEW DELHI: The government said regular emergen...,TOI,Covaxin
3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin
4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin


### Dropping records that contain '###'

In [427]:
covaxin_df = covaxin_df[covaxin_df['DOC'] != '###']

In [428]:
covaxin_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin
1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin
2,NEW DELHI: The government said regular emergen...,TOI,Covaxin
3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin
4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin


In [429]:
covaxin_df.shape

(877, 3)

### Dropping Duplicates

In [430]:
#Dropping duplicates based on DOC column
#(this cell previously de-duplicated pfizer_df, leaving covaxin_df untouched)
covaxin_df.drop_duplicates(subset=['DOC'],inplace=True)

### Dropping Empty Records

In [431]:
covaxin_df.isnull().sum()

DOC          0
NEWSPAPER    0
Label        0
dtype: int64

### Resetting Index of Dataframe

In [432]:
covaxin_df.reset_index(inplace=True)

In [433]:
covaxin_df.head()

Unnamed: 0,index,DOC,NEWSPAPER,Label
0,0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin
1,1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin
2,2,NEW DELHI: The government said regular emergen...,TOI,Covaxin
3,3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin
4,4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin


In [434]:
#Later dropping the index column that got created after reset_index
covaxin_df.drop(['index'],axis=1,inplace=True)

In [435]:
covaxin_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin
1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin
2,NEW DELHI: The government said regular emergen...,TOI,Covaxin
3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin
4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin


In [436]:
#Shape of dataset
covaxin_df.shape

(877, 3)

### Getting the cleaned articles

In [437]:
articles = covaxin_df['DOC']

clean_articles = clean_text(articles)
#Previewing only the first few cleaned articles instead of dumping the whole list
clean_articles[:5]

['raipur covaxin breached final frontier chhattisgarh health minister t singh deo refused use state clear human trial said take covaxin recovers covid booster shot covaxin came singh deo proud swadeshi vaccine firmly believe due process need followed per international norm put general use tweeted singh deo dcgi upgraded covaxin clinical trial phase although final data phase iii trial yet published considering allow covaxin wish take deo added chhattisgarh received dos covaxin recently deo said covaxin use optional chhattisgarh initially allowed government medical college district hospital countering opposition allegation said want remind everyone health people top priority state health department said health worker frontline worker received first dose covid vaccine tnn',
 'new delhi close heel brazil sealing deal secure million dos covaxin india first indigenous covid vaccine zimbabwe authorised use report swati bharadwaj zimbabwe becomes first african country authorise use two dose wh

In [438]:
len(clean_articles)

877

### Joining in the dataframe as "clean_article"

In [439]:
covaxin_df['Clean_Article'] = clean_articles

In [440]:
covaxin_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label,Clean_Article
0,RAIPUR: Covaxin has breached the final frontie...,TOI,Covaxin,raipur covaxin breached final frontier chhatti...
1,NEW DELHI: Close on the heels of Brazil sealin...,TOI,Covaxin,new delhi close heel brazil sealing deal secur...
2,NEW DELHI: The government said regular emergen...,TOI,Covaxin,new delhi government said regular emergency us...
3,HYDERABAD: India’s first indigenous Covid-19 v...,TOI,Covaxin,hyderabad india first indigenous covid vaccine...
4,MUMBAI: The Maharashtra government on Wednesda...,TOI,Covaxin,mumbai maharashtra government wednesday raised...


In [441]:
covaxin_df.to_csv('Covaxin Clean Articles.csv',index=False)

# Sputnik Articles

### Reading the data

In [442]:
sputnik_df = pd.read_excel('Sputnik Articles.xlsx')

In [443]:
sputnik_df['NEWSPAPER'] = 'TOI'
sputnik_df['Label'] = 'Sputnik'

In [444]:
sputnik_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik
1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik
2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik
3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik
4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik


In [445]:
sputnik_df.shape

(220, 3)

### Dropping records that contain ???

In [446]:
sputnik_df = sputnik_df[sputnik_df['DOC'] != '???']

In [447]:
sputnik_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik
1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik
2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik
3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik
4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik


### Dropping records that contain '###'

In [448]:
sputnik_df = sputnik_df[sputnik_df['DOC'] != '###']

In [449]:
sputnik_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik
1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik
2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik
3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik
4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik


In [450]:
sputnik_df.shape

(187, 3)

### Dropping Duplicates

In [451]:
#Dropping duplicates based on DOC column
#(this cell previously de-duplicated pfizer_df, leaving sputnik_df untouched)
sputnik_df.drop_duplicates(subset=['DOC'],inplace=True)

### Dropping Empty Records

In [452]:
sputnik_df.isnull().sum()

DOC          0
NEWSPAPER    0
Label        0
dtype: int64

### Resetting Index of Dataframe

In [453]:
sputnik_df.reset_index(inplace=True)

In [454]:
sputnik_df.head()

Unnamed: 0,index,DOC,NEWSPAPER,Label
0,0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik
1,1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik
2,2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik
3,3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik
4,4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik


In [455]:
#Later dropping the index column that got created after reset_index
sputnik_df.drop(['index'],axis=1,inplace=True)

In [456]:
sputnik_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label
0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik
1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik
2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik
3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik
4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik


In [457]:
#Shape of dataset
sputnik_df.shape

(187, 3)

### Getting the cleaned articles

In [458]:
articles = sputnik_df['DOC']

clean_articles = clean_text(articles)
#Previewing only the first few cleaned articles instead of dumping the whole list
clean_articles[:5]

['hyderabad following independent data safety monitoring board dsmb review phase clinical trial russian covid vaccine sputnik v dr reddy laboratory submitted data drug controller general india sought approval continue phase trial dr reddy said monday dsmb concluded safety concern identified study met primary endpoint safety dsmb reviewed safety data phase clinical trial russian covid vaccine sputnik v recommended phase recruitment continuation trial without modification dr reddy laboratory safety data submitted drug controller general india dcgi review approval continue phase clinical trial added sputnik phase study conducted subject part randomized double blind parallel group placebo controlled study india dr reddy lab partnered russian sovereign wealth fund rdif conduct phase trial vaccine also distribute million dos india indian clinical trial conducted dr reddy rdif adaptive design phase trial bridging study larger global phase study subject phase study india showed good safety pro

In [459]:
len(clean_articles)

187

### Joining in the dataframe as "clean_article"

In [460]:
sputnik_df['Clean_Article'] = clean_articles

In [461]:
sputnik_df.head()

Unnamed: 0,DOC,NEWSPAPER,Label,Clean_Article
0,HYDERABAD: Following the independent Data and ...,TOI,Sputnik,hyderabad following independent data safety mo...
1,Swati.Bharadwaj@timesgroup.com Covid brought n...,TOI,Sputnik,swati bharadwaj timesgroup com covid brought u...
2,NEW DELHI: Russian envoy in New Delhi Nikolay ...,TOI,Sputnik,new delhi russian envoy new delhi nikolay kuda...
3,MOSCOW: Russia's boast in August that it was t...,TOI,Sputnik,moscow russia boast august first country autho...
4,NEW DELHI: It may be some time before Russia’s...,TOI,Sputnik,new delhi may time russia covid vaccine sputni...


In [462]:
sputnik_df.to_csv('Sputnik Clean Articles.csv',index=False)