In [29]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER analyzer instance used by the sentiment step later in the script
sid = SentimentIntensityAnalyzer()

# Special characters and boilerplate phrases to strip from the review columns
removechar = [
    '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
    '-', '_', '=', '+', '{', '}', '[', ']', '|',
    '\\', ':', ';', '"', "'", '<', '>', ',', '.', '?',
    '/', '~', '`',
    '✅ Trip Verified', 'Not Verified', 'Â Â',
]

# Load the raw reviews (first CSV column becomes the index), then drop
# rows that are exact duplicates
url = 'https://raw.githubusercontent.com/DyanelLancea/Airline-Review-Sentiement-Analysis/refs/heads/master/airlines_review.csv'
df = pd.read_csv(url, index_col=0).drop_duplicates()

# Fill missing values: 'Unknown' for text columns, "NA" for every other column
def replace_missing_value(df):
    """Fill the missing values of every column of ``df`` in place.

    Text columns (object dtype or a pandas string dtype) are filled with
    ``'Unknown'``; all remaining columns are filled with the string ``"NA"``.

    Args:
        df: DataFrame to clean. Each column is reassigned on this object.

    Returns:
        The same DataFrame object that was passed in, matching what
        ``remove_special_characters`` does. Callers that ignore the return
        value keep working because the frame is still modified in place.
    """
    for col in df.columns:
        column = df[col]
        # Comparing only against 'O' misses pandas' dedicated string dtypes,
        # whose gaps would then be filled with "NA" instead of 'Unknown'.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        df[col] = column.fillna('Unknown' if is_text else "NA")
    return df

# Strip unwanted characters/phrases from the 'Airlines' and 'Text Content' columns
def remove_special_characters(df, removechar, char=''):
    """Remove every entry of ``removechar`` from two columns of ``df``.

    Each entry is treated as a literal substring (not a regex) and the
    entries are applied one after another in list order. In 'Airlines' a
    match is replaced by a single space; in 'Text Content' it is deleted.

    Args:
        df: DataFrame holding 'Airlines' and 'Text Content' columns; both
            columns are reassigned on this object.
        removechar: Iterable of literal substrings to remove.
        char: Unused. The original loop variable shadowed this parameter, so
            its value never had any effect. It is kept, now optional, so
            existing calls that pass it continue to work.

    Returns:
        The same DataFrame object that was passed in.
    """
    for token in removechar:
        # regex=False so characters such as '.', '*' or '(' match literally
        df['Airlines'] = df['Airlines'].str.replace(token, ' ', regex=False)
        df['Text Content'] = df['Text Content'].str.replace(token, '', regex=False)
    return df

# Run the cleaning helpers; their return values are not used here because
# both reassign the columns on df itself
replace_missing_value(df)
remove_special_characters(df, removechar, char='')

# Normalise casing: title case for airline and reviewer names,
# lower case for the review text
for title_col in ('Airlines', 'Name'):
    df[title_col] = df[title_col].str.title()
df['Text Content'] = df['Text Content'].str.lower()

# Strip leading whitespace from all four text columns
for text_col in ('Airlines', 'Name', 'Date Published', 'Text Content'):
    df[text_col] = df[text_col].str.lstrip()

# Write the cleaned data to a new CSV file (the index is not written)
df.to_csv('airlines_review_cleaned.csv', index=False)

# Show the cleaned DataFrame
print(df)

# Sentiment analysis on a single review from 'Text Content' & display its scores
# NOTE(review): [35] looks like a lookup by index label rather than by
# position; it may raise KeyError if that row was removed as a duplicate —
# confirm which row is intended.
sentiment_scores = sid.polarity_scores(df['Text Content'][35])
compound_score = sentiment_scores['compound']

# Classify by the compound score: >= 0.05 positive, <= -0.05 negative,
# anything in between neutral
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

# Print the full score dict. The original referenced an undefined name
# `scores` here, which raised NameError before anything was printed.
score_results = "Sentiment Analysis Result: " + str(sentiment_scores)
print(score_results)
print(f"The sentiment of the text is: {sentiment}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


              Airlines             Name Date Published  \
0        Qatar Airways     Romana Malik     2025-09-07   
1        Qatar Airways           J Raiz     2025-09-02   
2        Qatar Airways       Iman Yusuf     2025-09-01   
3        Qatar Airways     Ronald Zwart     2025-08-26   
4        Qatar Airways  Dmitriy Berezin     2025-08-21   
...                ...              ...            ...   
14586  Hainan Airlines        J Depaepe     2010-01-19   
14587  Hainan Airlines  Pieter D'Hamers     2010-01-10   
14588  Hainan Airlines           Y Chen     2010-01-09   
14589  Hainan Airlines          A Smith     2009-12-16   
14590  Hainan Airlines    Richard Borst     2009-12-11   

                                            Text Content  
0      we choose our seats when booking and they chan...  
1      initially i was supposed to be traveling with ...  
2      i want to sincerely thank qatar airways for th...  
3      boarding was efficient friendly personable wel...  
4      w