In [None]:
pip install wordsegment



In [None]:
import nltk
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordsegment import segment, load


In [None]:
# Fetch the NLTK resources this notebook relies on. Each call is a no-op when
# the package is already present (see the "already up-to-date" log below).
nltk.download('punkt')      # tokenizer models; presumably what word_tokenize needs
nltk.download('wordnet')    # lexical database backing WordNetLemmatizer
nltk.download('stopwords')  # word lists read via stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Loading Data

In [None]:
# Load the raw tweets. The preview shows the columns Tweet_ID, full_text and
# Sentiment_Label, plus two "Unnamed" columns that look like previously saved
# row indices (NOTE(review): confirm they are safe to ignore downstream).
df = pd.read_csv("data_to_clean.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,Tweet_ID,full_text,Sentiment_Label
0,0,1353493911822454784,$ENZC Huge week ahead of us. We will set new h...,negative
1,1,1353539100448591875,Sen. Warren says there are 'lifetime consequen...,negative
2,2,1353562486125993984,"Look, her dog really did save my daughter's li...",neutral
3,3,1353563705523376128,Oakland’s mayor &amp; administrator are trying...,negative
4,4,1353586020000751616,Good thread https://t.co/rF5yytmv1y,positive


### Lower case

In [None]:
# Lower-case every tweet in a single vectorized pass. The .str accessor
# propagates missing values instead of raising on a non-string cell.
df["full_text"] = df["full_text"].str.lower()

df.head()

  

Unnamed: 0.1,Unnamed: 0,Tweet_ID,full_text,Sentiment_Label
0,0,1353493911822454784,$enzc huge week ahead of us. we will set new h...,negative
1,1,1353539100448591875,sen. warren says there are 'lifetime consequen...,negative
2,2,1353562486125993984,"look, her dog really did save my daughter's li...",neutral
3,3,1353563705523376128,oakland’s mayor &amp; administrator are trying...,negative
4,4,1353586020000751616,good thread https://t.co/rf5yytmv1y,positive


### URL removal

In [None]:
def remove_urls(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` URL deleted.

    A URL is matched from its prefix up to the next whitespace character.
    Nothing is inserted in its place, so the surrounding spaces are kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Strip URLs from every tweet in one pass instead of a per-row df.at loop.
df["full_text"] = df["full_text"].apply(remove_urls)

# Spot-check the first tweet (explicit label-based lookup).
df.loc[0, "full_text"]


'$enzc huge week ahead of us. we will set new highs. pr is about to drop any day now. huge partnerships on the way. they are the kings of monoclonal antibodies. they have multiple patents that nobody can touch. they are ahead of the game. 🔥 #hiv #aids #covid #ms #cancer @tamu '

### Remove numbers

In [None]:
# Delete every run of digits from the tweets with a vectorized regex replace.
df["full_text"] = df["full_text"].str.replace(r"\d+", "", regex=True)

df.head()


Unnamed: 0.1,Unnamed: 0,Tweet_ID,full_text,Sentiment_Label
0,0,1353493911822454784,$enzc huge week ahead of us. we will set new h...,negative
1,1,1353539100448591875,sen. warren says there are 'lifetime consequen...,negative
2,2,1353562486125993984,"look, her dog really did save my daughter's li...",neutral
3,3,1353563705523376128,oakland’s mayor &amp; administrator are trying...,negative
4,4,1353586020000751616,good thread,positive


### Remove punctuation

In [None]:
# Build the deletion table once (it is identical for every row), then remove
# all ASCII punctuation from every tweet in a single vectorized call.
# Non-ASCII punctuation such as curly quotes is not covered by
# string.punctuation and is therefore left in place by this step.
punctuation_table = str.maketrans("", "", string.punctuation)
df["full_text"] = df["full_text"].str.translate(punctuation_table)

df.head()


Unnamed: 0.1,Unnamed: 0,Tweet_ID,full_text,Sentiment_Label
0,0,1353493911822454784,enzc huge week ahead of us we will set new hig...,negative
1,1,1353539100448591875,sen warren says there are lifetime consequence...,negative
2,2,1353562486125993984,look her dog really did save my daughters life...,neutral
3,3,1353563705523376128,oakland’s mayor amp administrator are trying t...,negative
4,4,1353586020000751616,good thread,positive


In [None]:
def remove_more(full_full_full_full_text):
  """Delete typographic leftovers that ASCII punctuation removal misses.

  Removes curly single/double quotes, the bullet and the em dash, then
  deletes one specific Devanagari hashtag string. Returns the cleaned text.
  """
  # The single characters are dropped first, then the multi-character literal.
  stray_chars = str.maketrans('', '', '‘’”“•—')
  without_strays = full_full_full_full_text.translate(stray_chars)
  return without_strays.replace('किसानआंदोलनजारीरहेगा', '')

# 1. Drop the typographic characters handled by remove_more.
# 2. Turn every whitespace character (newline, tab, ...) into a plain space.
#    Otherwise step 3 deletes it and glues the neighbouring words together.
# 3. Keep only lower-case ASCII letters and spaces.
df["full_text"] = (
    df["full_text"]
    .apply(remove_more)
    .str.replace(r"\s", " ", regex=True)
    .str.replace(r"[^a-z ]+", "", regex=True)
)



In [None]:
# Spot-check one cleaned tweet by its row label.
df.loc[414, 'full_text']

'last chance to participate in the bn h eugreendeal call closing today help  respond to the climatecrisis protect our ecosystems amp biodiversity amp accelerate recovery from the covid crisissubmit your application by  pm '

### Emoji removal


In [None]:
def remove_emojis(data):
    """Return ``data`` with every character in the ranges below removed.

    The ranges target emoji, pictographs and related joiner/selector code
    points. NOTE(review): the span U+24C2..U+1F251 and the span
    U+10000..U+10FFFF are much wider than emoji alone (CJK ideographs, for
    instance, fall inside the former), so non-emoji text in those ranges is
    removed too. Confirm this is intended before reusing on non-English text.
    """
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (regional indicator symbols)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (repeated range, harmless in a class)
        "\U000024C2-\U0001F251",  # very wide span, see docstring note
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
    return emoji_pattern.sub('', data)

In [None]:
# Trim surrounding whitespace, then strip emoji. Series.apply returns a new
# Series, so the result must be assigned back for the cleaning to take effect.
df["full_text"] = df["full_text"].str.strip().apply(remove_emojis)

df["full_text"]

0       enzc huge week ahead of us we will set new hig...
1       sen warren says there are lifetime consequence...
2       look her dog really did save my daughters life...
3       oaklands mayor amp administrator are trying to...
4                                             good thread
                              ...                        
9718    my dad has cancer and my teacher mother is the...
9719          repkatieporter they also lost track of this
9720    the covid vaccine is not controversial in preg...
9721    the smugness of msm acting as if they dont con...
9722    oh to be a less than mediocre white dude in a ...
Name: full_text, Length: 9723, dtype: object

In [None]:
# Spot-check the first cleaned tweet by its row label.
df.loc[0, 'full_text']

'enzc huge week ahead of us we will set new highs pr is about to drop any day now huge partnerships on the way they are the kings of monoclonal antibodies they have multiple patents that nobody can touch they are ahead of the game  hiv aids covid ms cancer tamu'

### Word Segmentation


In [None]:
# Load wordsegment's corpus statistics; required before the first segment().
load()


def _segment_tokens(text):
  """Split run-together tokens (e.g. former hashtags) into separate words.

  wordsegment's segment() drops every non-alphanumeric character, spaces
  included, before segmenting. Feeding it a whole tweet would therefore
  discard the word boundaries that are already known. Segmenting one
  whitespace-delimited token at a time keeps those boundaries and only
  splits inside a token.
  """
  return " ".join(piece for token in text.split() for piece in segment(token))


df["full_text"] = df["full_text"].apply(_segment_tokens)


### Remove white spaces

In [None]:
# Trim leading and trailing whitespace from every tweet in one vectorized call.
df["full_text"] = df["full_text"].str.strip()

df.head()

### Remove stop words

In [None]:
# A set gives O(1) membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
  """Tokenize ``text`` and re-join the tokens that are not English stop words."""
  return " ".join(token for token in word_tokenize(text) if token not in stop_words)


df["full_text"] = df["full_text"].apply(_drop_stop_words)

# Spot-check one tweet after stop-word removal.
df.loc[1, "full_text"]


### Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()

# Tokenize each tweet, lemmatize every token with the lemmatizer's default
# settings, and join the results back into a single space-separated string.
df["full_text"] = df["full_text"].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))
)

df.head()

### Labeling

In [None]:
# Encode the sentiment labels as integers with a single lookup table.
# Values that are not in the table (including already-encoded integers when the
# cell is re-run) pass through unchanged. When every label is mapped, the
# column ends up with an integer dtype rather than object.
LABEL_TO_ID = {'negative': 0, 'neutral': 1, 'positive': 2}

df["Sentiment_Label"] = df["Sentiment_Label"].map(
    lambda label: LABEL_TO_ID.get(label, label)
)

### Dropping empty rows

In [None]:
# Remove every row whose cleaned text ended up empty. The labels are collected
# with one boolean mask and dropped in a single call; df is still modified in
# place.
empty_rows = df.index[df["full_text"] == ""]
df.drop(index=empty_rows, inplace=True)

### Convert to CSV

In [None]:
# Write the cleaned frame to disk; index=False keeps the DataFrame's row index
# out of the output file.
df.to_csv('cleaned_data_final4.csv', index=False)