<a href="https://colab.research.google.com/github/Aneesh-CQ/Aneesh-CQ.github.io/blob/main/Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NATURAL LANGUAGE PROCESSING: FLIPKART REVIEW SENTIMENT ANALYSIS

### Importing Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

### Using the NLTK Library for Natural Language Processing

In [None]:
import nltk
# Download the NLTK resources this notebook relies on: the stop-word lists and
# the Punkt tokenizer data.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# NOTE(review): `punkt` is not referenced by name in the visible cells —
# possibly removable; verify before deleting.
from nltk import punkt

### Loading the Dataset

In [None]:
# Read the review dataset from a CSV file in the working directory and preview
# the first rows. Later cells read the 'rating' and 'review' columns.
data = pd.read_csv('flipkart_data.csv')
data.head()

### Data Preprocessing and Preliminary analysis





In [None]:
# Bar chart of how many reviews fall under each distinct 'rating' value.
sns.countplot(data=data, x='rating')

### Creating a separate column to classify the reviews as positive or negative based on ratings
If the rating is less than 5, the review is considered negative; otherwise, it is considered positive.

In [None]:
# Binary sentiment label: 1 (positive) for 5-star ratings, 0 (negative) for
# every other rating.
# The comparison is vectorised, so it does not depend on the DataFrame having
# a default 0..n-1 index the way a data['rating'][i] lookup does.
pos_neg = (data['rating'] == 5).astype('int64').tolist()
data['classify'] = pos_neg
# Preview only the first rows instead of dumping the whole frame.
data.head()

In [None]:
from tqdm import tqdm


def preprocess_text(text_data):
    """Clean raw review strings so they can be vectorised.

    For each sentence: every character that is neither a word character nor
    whitespace is removed, the result is tokenised with ``nltk.word_tokenize``,
    tokens are lower-cased, and tokens found in NLTK's English stop-word list
    are dropped. The remaining tokens are joined back with single spaces.

    Parameters
    ----------
    text_data : iterable of str
        Raw sentences to clean. Each element is passed to ``re.sub``, so it
        must be a string.

    Returns
    -------
    list of str
        One cleaned string per input sentence, in input order.
    """
    # The stop-word list is identical for every token, so build it once and
    # keep it as a set for constant-time membership checks instead of
    # re-loading and scanning the list inside the token loop.
    english_stopwords = set(stopwords.words('english'))
    preprocessed_text = []

    for sentence in tqdm(text_data):
        # Removing punctuation
        sentence = re.sub(r'[^\w\s]', '', sentence)

        # Lower-case each token once, then drop the stop words
        lowered_tokens = (token.lower()
                          for token in nltk.word_tokenize(sentence))
        preprocessed_text.append(
            ' '.join(token for token in lowered_tokens
                     if token not in english_stopwords))

    return preprocessed_text


In [None]:
# Clean every review, then overwrite the 'review' column with the cleaned
# text; the cells that follow read data['review'] in its cleaned form.
preprocessed_review = preprocess_text(data['review'].values)
data['review'] = preprocessed_review
data.head()

### Creating a Wordcloud for visualization of the frequency of words

In [None]:
# Concatenate the text of all positively-labelled reviews into one string.
positive_reviews = data.loc[data['classify'] == 1, 'review'].astype(str)
consolidated = ' '.join(positive_reviews)

# Fixed random_state keeps the word layout the same between runs.
wordCloud = WordCloud(
    width=1600,
    height=800,
    random_state=21,
    max_font_size=110,
)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(15, 10))
cloud_image = wordCloud.generate(consolidated)
plt.imshow(cloud_image, interpolation='bilinear')
plt.axis('off')
plt.show()

### Transforming text data using TF-IDF

In [None]:
# Convert the cleaned reviews into TF-IDF features, capped at 2500 terms, and
# densify the result into an array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so statistics from the held-out rows influence the
# features — consider fitting on the training rows only.
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(data['review'] ).toarray()
X

### Model Training, Testing and Evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. Stratifying on the label keeps the
# positive/negative ratio the same in both splits, and the fixed random_state
# makes the split (and therefore the reported accuracy) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, data['classify'],
                                                    test_size=0.25,
                                                    stratify=data['classify'],
                                                    random_state=42)

In [None]:
# Disabled experiment kept as a bare string literal (it is not executed): a
# decision-tree baseline.
# NOTE(review): as written it scores predictions on X_train, i.e. it reports
# training accuracy rather than held-out accuracy.
'''from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train,y_train)

#testing the model
pred = model.predict(X_train)
print(accuracy_score(y_train,pred))'''

In [None]:
import xgboost as xgb
# Show the default hyperparameters of the classifier trained in the next cell.
# The estimator class is XGBClassifier; the xgboost package exposes no
# `Classifier` attribute.
xgb.XGBClassifier().get_params()

In [None]:
# Instantiate the XGBoost classifier with its default hyperparameters
xg_clf = xgb.XGBClassifier()
# Fit it on the training split
xg_clf.fit(X_train,y_train)

In [None]:
# Predict a class label for every row of the held-out test split
xgb_preds = xg_clf.predict(X_test)
xgb_preds

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate on the held-out split: fraction of test rows whose predicted label
# equals the true label.
test_accuracy = accuracy_score(y_test, xgb_preds)
print(test_accuracy)