In [1]:
import pandas as pd
# Load the review workbook from the current working directory.
df = pd.read_excel('NIT Trichy.xlsx')
# Cast every column to str so the text helpers below can call string methods.
# NOTE(review): missing cells presumably become the literal text 'nan' after
# this cast, which would also make the non-null counts printed by info()
# uninformative -- confirm against the raw sheet.
df = df.astype(str)
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Review_title  225 non-null    object
 1   Body          225 non-null    object
dtypes: object(2)
memory usage: 3.6+ KB


In [2]:
df.head()

Unnamed: 0,Review_title,Body
0,This is the best college one could get into NITT,College Infrastructure\nThis is 800 acres camp...
1,Life in Nit Trichy,College Infrastructure\nThe infrastructure is ...
2,Feeling happy for getting into such a good col...,"College Infrastructure\nYes,it's hygienic and ..."
3,The best college for all round activities,College Infrastructure\nThe infrastructure her...
4,Very nice.,College Infrastructure\nThe infrastructure of ...


#  ------------------Cleaning the data--------------------------

In [5]:
import re


def CleanTxt(text):
    """Collapse every run of non-alphanumeric characters in ``text`` to one space.

    Only ASCII letters and digits are kept; punctuation, newlines and any
    other characters are replaced. Leading/trailing runs also become a
    single space (the result is not stripped).
    """
    non_alnum_run = re.compile("[^A-Za-z0-9]+")
    return non_alnum_run.sub(" ", text)

# Store the cleaned review body in the working column 'clear r'.
df['clear r'] = df['Body'].apply(CleanTxt)

# --------------Converting to lower case-----------------

In [6]:
def low(a):
    """Return ``a`` lower-cased via its ``lower()`` method."""
    lowered = a.lower()
    return lowered

# Lower-case the cleaned text in place (overwrites 'clear r').
df['clear r'] = df['clear r'].apply(low)

# -------------------Tokenization-------------------------------

In [7]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases load it
# from the 'punkt_tab' package instead of the pickled 'punkt' one, so fetch
# both to keep the notebook runnable on a fresh environment with either.
nltk.download('punkt')
nltk.download('punkt_tab')
    
from nltk.tokenize import word_tokenize
def Tokenization(dataset):
    """Split the string ``dataset`` into tokens with NLTK's ``word_tokenize``.

    Returns whatever ``word_tokenize`` returns for the given text.
    """
    return word_tokenize(dataset)

# Tokenize the lower-cased text; each row of 'tokens' holds that review's tokens.
df['tokens'] = df['clear r'].apply(Tokenization)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\214220002\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# -----------------------Stemming------------------------------

In [8]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

# Porter stemmer instance used by stem().
pst=PorterStemmer()
# NOTE(review): `lancaster` is not referenced anywhere in the visible code;
# confirm whether it is still needed.
lancaster=LancasterStemmer()

def stem(text):
    """Return a new list with the Porter stem of every item in ``text``.

    ``text`` is iterated item by item; each item is passed to the
    module-level ``pst`` stemmer and the results keep the input order.
    """
    return [pst.stem(token) for token in text]
    

# Apply stem() to each row's token list and store the result in 'clean review'.
df['clean review'] = df['tokens'].apply(stem)

# ----------------Remove the Stop Words--------------------

In [9]:
import nltk.corpus

# Fetch the NLTK stop-word corpus (the log below shows it was already present).
nltk.download('stopwords')


# English stop-word list read by ReStop() as a module-level global.
stopwords = nltk.corpus.stopwords.words('english')


def ReStop(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords`` list is
        used, which matches the original single-argument behaviour.

    Returns
    -------
    list
        The tokens that do not appear in the stop-word collection.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan over the whole stop-word list.
    blocked = set(stopwords if stop_words is None else stop_words)
    return [token for token in text if token not in blocked]


# Overwrite 'clean review': remove stop words from the raw lower-case tokens
# first, then stem what is left. Filtering tokens that were already stemmed
# misses stop words whose stem differs from the listed form
# (e.g. 'this' -> 'thi', 'was' -> 'wa'), so they would leak into the output.
df['clean review'] = df['tokens'].apply(ReStop).apply(stem)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\214220002\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# ----------------------Joining Tokens--------------------------

In [10]:
def join(filtered_sentence):
    """Return the items of ``filtered_sentence`` as one space-separated string.

    Every item is converted with ``str()`` first; an empty input yields ''.
    """
    return ' '.join(map(str, filtered_sentence))

# Turn each row's token list back into a single space-separated string.
df['clean review'] = df['clean review'].apply(join)


# -----------------Sentiment analysis (subjectivity and polarity)--------------------

In [11]:
from textblob import TextBlob

def Sub(text):
    """Return the TextBlob subjectivity value computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def pol(text):
    """Return the TextBlob polarity value computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Score every processed review with TextBlob.
# NOTE(review): 'clean review' holds stemmed tokens; if TextBlob's sentiment
# lookup expects unstemmed words, scoring the unstemmed text may be more
# accurate -- confirm before relying on these values.
df['Subjectivity'] = df['clean review'].apply(Sub)
df['Polarity'] = df['clean review'].apply(pol)

# ------------------Map polarity score to a sentiment label-----------

In [12]:
def GetAna(score):
    """Map a numeric polarity ``score`` to a sentiment label.

    Returns 'NEGATIVE' when the score is below zero, 'NEUTRAL' when it
    equals zero, and 'POSITIVE' for every other value.
    """
    if score < 0:
        return 'NEGATIVE'
    return 'NEUTRAL' if score == 0 else 'POSITIVE'
    
    
# Label each review NEGATIVE / NEUTRAL / POSITIVE from its polarity score.
df['Analysis'] = df['Polarity'].apply(GetAna)

# ---------------------Export data to file-------------------- 

In [13]:
# Remove the intermediate working columns, then write the result to Excel
# in the current working directory.
df = df.drop(columns=['clear r', 'tokens'])

df.to_excel('Cleaned Review.xlsx')