In [1]:
#import neccessary libraries
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
#Load dataset
full_df = pd.read_csv("/content/drive/Othercomputers/My Laptop/projects/FlexiSAF/Text_preprocessing/sample.csv")
df = full_df[["text"]]
df["text"] = df["text"].astype(str)
df.head()

Unnamed: 0,text
0,@AppleSupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...
2,@76328 I really hope you all change but I'm su...
3,@105836 LiveChat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...


In [4]:
#Function for Lowercasing
def to_lowercase(text):
  return text.lower()

df["text_lowercase"] = df["text"].apply(lambda text: to_lowercase(text))
df.head()

Unnamed: 0,text,text_lowercase
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...


In [6]:
#Function to remove punctuations
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["text_wo_punctuations"] = df["text_lowercase"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,text,text_lowercase,text_wo_punctuations
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...


In [8]:
#Function to remove stopwords

#set of stop words
stop_words = set(stopwords.words('english'))
print(list(stop_words)[:10])

def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in stop_words])

df["text_wo_stopwords"] = df["text_wo_punctuations"].apply(lambda text: remove_stopwords(text))
df["text_wo_stopwords"].head()

['be', "don't", 'on', 'his', "doesn't", "they'd", 'now', 'again', 've', 'do']


Unnamed: 0,text_wo_stopwords
0,applesupport causing reply disregarded tapped ...
1,105835 business means lot us please dm name zi...
2,76328 really hope change im sure wont dont
3,105836 livechat online moment httpstcosy94vtu8...
4,virgintrains see attached error message ive tr...


In [14]:
# function to remove frequent words

# firstly looping through the values for the texts without stop words
# to find the top ten most common words which would become the frequent words

from collections import Counter
cnt = Counter()
for text in df["text_wo_stopwords"].values:
    for word in text.split():
        cnt[word] += 1

print(cnt.most_common(10))

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_wo_stopfreq"] = df["text_wo_stopwords"].apply(lambda text: remove_freqwords(text))
df["text_wo_stopfreq"].head()

[('us', 25), ('dm', 19), ('help', 18), ('thanks', 13), ('httpstcogdrqu22ypt', 12), ('applesupport', 11), ('please', 11), ('phone', 9), ('hi', 9), ('ive', 8)]


Unnamed: 0,text_wo_stopfreq
0,causing reply disregarded tapped notification ...
1,105835 business means lot name zip code additi...
2,76328 really hope change im sure wont dont
3,105836 livechat online moment httpstcosy94vtu8...
4,virgintrains see attached error message tried ...


In [16]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [17]:
# function to lemmatize taking the part of speech into consideration

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized"] = df["text_wo_stopfreq"].apply(lambda text: lemmatize_words(text))
df["text_lemmatized"].head()

Unnamed: 0,text_lemmatized
0,cause reply disregard tapped notification keyb...
1,105835 business mean lot name zip code additio...
2,76328 really hope change im sure wont dont
3,105836 livechat online moment httpstcosy94vtu8...
4,virgintrains see attach error message try leav...


In [29]:
pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.3


In [36]:
#function to correct spellings

from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        corrected = spell.correction(word)
        if (word in misspelled_words) and corrected:
            corrected_text.append(corrected)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

text = "This sentnce shold be corrrekted"
correct_spellings(text)

'This sentence should be corrected'

In [40]:
text2 = "my neme is nott vry importand pleese calll mie wahteverr yuo lke"
correct_spellings(text2)

'my name is not very important please call me whatever you like'