# **EDA - EXPLORATORY DATA ANALYSIS** 

# Importing necessary libraries to analyse the data



In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns 
import matplotlib.pyplot as plt

Loading & Reading the data into a Pandas DataFrame of the Company named "Flysafe Airlines" website reviews.

In [62]:
filepath = "/Flysafe Airlines.csv"
try:
    df = pd.read_csv(filepath)
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    print("Trying again with error_bad_lines=False")
    df = pd.read_csv(filepath, error_bad_lines=False)

Examining the data to get a sense of its structure, size, and format

In [63]:
df.head()

Unnamed: 0,no.,comment,label
0,0,"Mohammad harun, he is an awesome guy very info...",Postive
1,1,"amazing guy gaurav was, so patience and kind. ...",Postive
2,2,Gaurav was very knowledgeable and very helpful...,Postive
3,3,I called them regarding my flight cancellation...,Postive
4,4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive


In [64]:
df.tail()

Unnamed: 0,no.,comment,label
308379,308379,You have to double check price before place or...,Postive
308380,308380,My reserved flight was just as I expected. Th...,Postive
308381,308381,Quick easy and cheap.,Postive
308382,308382,Great experience & price,Postive
308383,308383,I found the website extremely helpful. I have ...,Postive


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308384 entries, 0 to 308383
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   no.      308384 non-null  int64 
 1   comment  308384 non-null  object
 2   label    308384 non-null  object
dtypes: int64(1), object(2)
memory usage: 7.1+ MB


In [66]:
df.describe()

Unnamed: 0,no.
count,308384.0
mean,154191.5
std,89022.937044
min,0.0
25%,77095.75
50%,154191.5
75%,231287.25
max,308383.0


In [67]:
print(df.columns)

Index(['no.', 'comment', 'label'], dtype='object')


In [68]:
df = df[['comment','label']]
print(df.shape)

(308384, 2)


In [69]:
print(df['label'].value_counts())

Postive     271113
Negative     37271
Name: label, dtype: int64


In [71]:
negative_df = df[df['label'] == 'Negative']

print(negative_df.shape)
negative_df.head()

(37271, 2)


Unnamed: 0,comment,label
37,I have had nothing but trouble dealing with th...,Negative
63,Ankush was a great help and he explained me ea...,Negative
186,Scaaaam don't want to give me my refund and th...,Negative
190,ASAP Ticket is a SCAM!!! They have refused to ...,Negative
199,"So far so good for ASAP Tickets Team, especial...",Negative


In [72]:
sentiment_label = label_df.label.factorize()
sentiment_label

(array([0, 0, 0, ..., 0, 0, 0]),
 Index(['Postive', 'Negative'], dtype='object'))

Handling missing or null values

In [73]:
df.dropna()

Unnamed: 0,comment,label
0,"Mohammad harun, he is an awesome guy very info...",Postive
1,"amazing guy gaurav was, so patience and kind. ...",Postive
2,Gaurav was very knowledgeable and very helpful...,Postive
3,I called them regarding my flight cancellation...,Postive
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,Postive
...,...,...
308379,You have to double check price before place or...,Postive
308380,My reserved flight was just as I expected. Th...,Postive
308381,Quick easy and cheap.,Postive
308382,Great experience & price,Postive


In [74]:
df.isnull()

Unnamed: 0,comment,label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
308379,False,False
308380,False,False
308381,False,False
308382,False,False


Cleaning and transforming the data.

In [83]:
df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
308379    False
308380    False
308381    False
308382    False
308383    False
Length: 304324, dtype: bool

In [76]:
df.to_csv('Flysafe Airline cleaned_data.csv', index=False)

# Traing & Testing the Data

In [77]:
def train_test_split_data(df, test_size, random_state):
    X_train, X_test, y_train, y_test = train_test_split(df["comment"], df["label"], test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test