In [None]:
# Basic Operation
import pandas as pd
import numpy as np

# Text Preprocessing & Cleaning
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re


from sklearn.model_selection import train_test_split # Split Data
from imblearn.over_sampling import SMOTE # Handling Imbalanced

# Model Building
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC


from sklearn.metrics import classification_report , confusion_matrix , accuracy_score # Performance Metrics


# Data Visualization
import matplotlib.pyplot as plt





In [None]:
# Location of the tweets CSV (an absolute path; presumably a Colab upload — adjust per environment).
TWEETS_CSV_PATH = '/content/Tweets.csv'

# Load the raw tweets; later cells read the 'airline_sentiment',
# 'negativereason' and 'text' columns from this frame.
df = pd.read_csv(TWEETS_CSV_PATH)


In [None]:
# Map sentiment labels to integer class ids (negative=0, neutral=1, positive=2)
def convert_Sentiment(sentiment):
    """Return the integer class id for a sentiment label.

    "positive" -> 2, "neutral" -> 1, "negative" -> 0.
    Any other value (including different capitalisation) yields None.
    """
    for label, class_id in (("positive", 2), ("neutral", 1), ("negative", 0)):
        if sentiment == label:
            return class_id
    return None

In [None]:
# Replace the sentiment strings in 'airline_sentiment' with their integer ids.
# Not re-runnable: once the column holds integers, convert_Sentiment matches
# none of them, so a second run would turn every value into None.
df['airline_sentiment'] = df['airline_sentiment'].apply(convert_Sentiment)

In [None]:
import nltk


In [None]:
# Download the NLTK 'stopwords' corpus that remove_stopwords reads via
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Lazily-built cache of the English stop words, so the NLTK corpus is read
# once instead of once per word.
_ENGLISH_STOPWORDS = None

# Remove stop words
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text: input string; it is split on whitespace.
        stop_words: optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used (loaded once and cached as a
            frozenset for constant-time membership checks).

    Returns:
        The remaining words joined by single spaces. Matching is exact and
        case-sensitive, as before.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return ' '.join(word for word in text.split() if word not in stop_words)

# Strip URLs
def remove_url(text):
    """Delete http:// / https:// links and www.-prefixed links from ``text``.

    Each link is matched up to the next whitespace character and replaced
    with the empty string; surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove punct
def remove_punct(text):
    """Delete every ASCII punctuation character from ``text``.

    The characters removed are those in ``string.punctuation``; all other
    characters (including whitespace) are kept as they are.
    """
    # Local import: `string` is not among the notebook's imports, so the
    # original body raised NameError when called.
    import string

    return text.translate(str.maketrans('', '', string.punctuation))

# Strip HTML tags
def remove_html(text):
    """Delete anything that looks like an HTML tag from ``text``.

    A "tag" is the shortest run starting at '<' and ending at the next '>'
    on the same line; the text between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)

# Remove @username
def remove_username(text):
    """Delete @mentions from ``text``.

    A mention is '@' followed by one or more non-whitespace characters, so
    punctuation glued to the handle (e.g. "@bob!") is removed with it.
    """
    # Raw string: the original non-raw '@[^\s]+' relied on the invalid string
    # escape '\s', which newer Python versions warn about.
    return re.sub(r'@[^\s]+', '', text)

# Strip emoji
def remove_emoji(text):
    """Delete every character that falls inside the code-point ranges below.

    Note that the last range (U+24C2..U+1F251) is very wide: besides emoji it
    also spans many other characters between those code points (CJK
    ideographs, for example), which are removed as well.
    """
    code_point_ranges = (
        ("\U0001F600", "\U0001F64F"),  # emoticons
        ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
        ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
        ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
        ("\U00002702", "\U000027B0"),
        ("\U000024C2", "\U0001F251"),
    )
    # Assemble one character class "[a-bc-d...]+" out of the ranges.
    char_class = "[" + "".join(f"{first}-{last}" for first, last in code_point_ranges) + "]+"
    return re.sub(char_class, r'', text, flags=re.UNICODE)


# Ordered (pattern, replacement) rules used by decontraction. Each rule
# rewrites the text that the following rules see, so order matters:
# compound forms ("won't've") come before their prefixes ("won't"), and
# whole-word rules come before the generic suffix rules ("n't", "'s", ...).
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # Whole-word contractions.
        (r"won't've", " will not have"),
        (r"can't've", " can not have"),
        (r"won't", " will not"),
        (r"can't", " can not"),
        (r"don't", " do not"),
        (r"ma'am", " madam"),
        (r"let's", " let us"),
        (r"ain't", " am not"),
        (r"shan't", " shall not"),
        (r"sha'n't", " shall not"),  # was r"sha\n't", which matched a newline
        (r"o'clock", " of the clock"),
        (r"y'all", " you all"),
        # Generic suffixes. The trailing \b keeps these from firing in the
        # middle of a word (e.g. the "'M" in "O'Malley").
        (r"n't've", " not have"),
        (r"n't", " not"),
        (r"'re\b", " are"),
        (r"'s\b", " is"),
        (r"'d've", " would have"),
        (r"'d\b", " would"),
        (r"'ll've", " will have"),
        (r"'ll\b", " will"),
        (r"'t\b", " not"),
        (r"'ve\b", " have"),
        (r"'m\b", " am"),
    )
)

# Decontraction text
def decontraction(text):
    """Expand English contractions in ``text`` (e.g. "can't" -> " can not").

    Rules are applied one after another in the order of _CONTRACTION_RULES.
    Matching is case-insensitive, so "Can't" and "CAN'T" are expanded too
    (previously "Can't" fell through to the generic "n't" rule and became
    "Ca not"). Replacements are lower-case and start with a space, exactly
    as the original rules did. Only the ASCII apostrophe (') is recognised.

    Args:
        text: the string to rewrite.

    Returns:
        The text with all matching contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

# Separate letters from digits
def seperate_alphanumeric(text):
    """Split ``text`` into runs of letters and runs of digits.

    Every maximal run of letters and every maximal run of digits becomes its
    own token; anything else (punctuation, underscores, whitespace) acts as a
    separator and is dropped. The tokens are returned joined by single spaces.
    """
    token_pattern = r"[^\W\d_]+|\d+"
    return " ".join(match.group(0) for match in re.finditer(token_pattern, text))

def cont_rep_char(text):
    """Replacement callback for ``re.sub``: shorten a run of repeated characters.

    Args:
        text: despite the name, a regex match object; its whole match
            (``group(0)``) is the run to shorten.

    Returns:
        The first two characters of the match, so "oooo" becomes "oo".
        A match of length 0 or 1 is returned unchanged (the original fell
        off the end of the function and returned None in that case).
    """
    return text.group(0)[:2]

def unique_char(rep, text):
    """Replace every run of a repeated word character in ``text`` using ``rep``.

    A run is a word character followed by one or more copies of itself.
    ``rep`` is handed to the regex substitution as-is, so it may be either a
    replacement string or a callable receiving the match object.
    """
    repeated_run = re.compile(r'(\w)\1+')
    return repeated_run.sub(rep, text)

def char(text):
    """Replace every character that is not an ASCII letter with a space.

    The replacement is one-for-one, so the result has the same length as the
    input.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

# Combine the negative reason (when present) with the tweet text; a missing
# reason contributes an empty string, leaving a leading space before the tweet.
# NOTE(review): if 'negativereason' is only filled in for negative tweets,
# prepending it hands the label to the model — confirm this is intended.
df['final_text'] = df['negativereason'].fillna('') + ' ' + df['text']


# Cleaning steps, applied to every tweet in exactly this order.
cleaning_steps = (
    remove_username,
    remove_url,
    remove_emoji,
    decontraction,
    seperate_alphanumeric,
    lambda tweet: unique_char(cont_rep_char, tweet),
    char,
    lambda tweet: tweet.lower(),
    remove_stopwords,
)
for clean in cleaning_steps:
    df['final_text'] = df['final_text'].apply(clean)

In [None]:
from textblob import TextBlob
# Step 4: sentiment polarity via TextBlob
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Polarity ranges from -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score every cleaned tweet (the 'final_text' column) and store the result in
# a new 'Sentiment Score' column.
df['Sentiment Score'] = df['final_text'].apply(calculate_sentiment)

In [None]:
# Features are the cleaned tweet text; the target is the sentiment column.
X, y = df['final_text'], df['airline_sentiment']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 2: TF-IDF features over unigrams and bigrams, capped at 1000 terms
# (adjust max_features as needed).
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before any
# train/test split, so the vocabulary and IDF weights also see test tweets.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
)
text_tfidf = tfidf_vectorizer.fit_transform(df['final_text'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target as consecutive integer class ids; the fitted encoder is
# kept in `label_encoder` so ids can be mapped back later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)


In [None]:
# Inspect the encoded label array.
y

array([1, 2, 1, ..., 1, 0, 1])

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE (seeded for reproducibility).
# Requires 'text_tfidf' (TF-IDF matrix) and 'y' (encoded labels) from the cells above.
# NOTE(review): this resamples the FULL dataset. If a train/test split is then
# taken from x_sm / y_sm, synthetic samples end up in the test set — resample
# only the training portion instead.
smote = SMOTE(random_state=42)
x_sm, y_sm = smote.fit_resample(text_tfidf, y)


In [None]:
# Split Data into train & test
# NOTE(review): X here is the raw 'final_text' column, not the TF-IDF matrix.
# The very next cell assigns the same four names again from x_sm / y_sm, so on
# a top-to-bottom run this split is overwritten before any model uses it.
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2)

<14640x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 129044 stored elements in Compressed Sparse Row format>

In [None]:
# Split Data into train & test
# Hold out the test set from the original (un-resampled) TF-IDF matrix first,
# then oversample ONLY the training portion. Splitting the SMOTE output
# instead puts synthetic samples — interpolated from rows that also land in
# training — into the test set and inflates every accuracy computed below.
# The split is stratified and seeded so a re-run reproduces the same partition.
# (Recorded outputs further down predate this change; re-run to refresh them.)
X_train, X_test, y_train, y_test = train_test_split(
    text_tfidf, y, test_size=0.2, stratify=y, random_state=42
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours baseline.
knn = KNeighborsClassifier(
    n_jobs=30,  # parallel jobs requested for the neighbour search
    p=1,        # Minkowski power parameter
)
knn.fit(X_train, y_train)

ValueError: ignored

In [None]:
# Predict labels for the held-out rows with the fitted k-NN model.
knn_predict = knn.predict(X_test)

NotFittedError: ignored

In [None]:
# k-NN accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, knn_predict)

ValueError: ignored

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel, C=2.0, seeded with 52.
svc = SVC(
    C=2.0,
    kernel='rbf',
    random_state=52,
)
svc.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted SVC.
svc_predict = svc.predict(X_test)

In [None]:
# SVC accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, svc_predict)

0.9442527692028327

AttributeError: ignored

In [None]:
# Random forest with default hyper-parameters. Seeded so that re-running the
# notebook grows the same forest and reports the same accuracy.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted random forest.
rf_prediction =  rf.predict(X_test)

In [None]:
# Random-forest accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, rf_prediction)

0.951334664971854

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble: 25 trees, depth capped at 300, seeded with 23.
model_11 = ExtraTreesClassifier(
    random_state=23,
    n_estimators=25,
    max_depth=300,
)
model_11.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted extra-trees model.
m11_pred = model_11.predict(X_test)

In [None]:
# Extra-trees accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, m11_pred)

0.949337207190848

In [None]:
import lightgbm as lgb
# Wrap the splits in LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=y_train)
# The evaluation set is built with reference=train_data so that it reuses the
# training set's feature binning, as LightGBM expects for validation data.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


In [None]:
# Inspect the LightGBM training Dataset object (displays its repr only).
train_data

<lightgbm.basic.Dataset at 0x7fc76df022c0>

In [None]:
# LightGBM training parameters.
# NOTE(review): no "objective" is set here, so LightGBM falls back to its
# default objective rather than a multiclass one — confirm that is intended
# for the three sentiment classes (switching would also require changing how
# the predictions are post-processed).
params = dict(
    num_leaves=300,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
)

In [None]:
# Number of boosting rounds (iterations) to run.
num_rounds = 150
# Train the booster, evaluating on both the training and the held-out Dataset.
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_rounds,
    valid_sets=[train_data, test_data],
)


In [None]:
# Raw LightGBM output for the held-out rows (not yet class labels).
y_pred = model.predict(X_test)


In [None]:
# Turn the raw model output into class labels.
# - 1-D output: one continuous score per row, so round to the nearest label
#   and clip to the label range seen in training (a rounded score can
#   otherwise land outside the valid classes, e.g. -1 or 3).
# - 2-D output: one column of scores per class (what predict returns under a
#   multiclass objective), so pick the highest-scoring column.
raw_scores = np.asarray(y_pred)
if raw_scores.ndim == 2:
    y_pred_class = raw_scores.argmax(axis=1)
else:
    y_pred_class = np.clip(
        np.rint(raw_scores), np.min(y_train), np.max(y_train)
    ).astype(int)
accuracy = accuracy_score(y_test, y_pred_class)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.92


In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with depth capped at 300. Seeded so that re-running
# the notebook builds the same tree and reports the same accuracy.
dt = DecisionTreeClassifier(max_depth=300, random_state=42)
dt.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted decision tree.
dt_predict = dt.predict(X_test)

In [None]:
# Decision-tree accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, dt_predict)

0.9368076992918104

In [None]:
import xgboost as xgb

# XGBoost classifier: 100 trees, learning rate 0.1, depth cap 300.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=300,
)


In [None]:
# Fit the XGBoost classifier on the training split.
xgb_model.fit(X_train, y_train)


In [None]:

# Predict labels for the held-out rows with the fitted XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# XGBoost accuracy on the test split (arguments in y_true, y_pred order).
accuracy_score(y_test, y_pred_xgb)

0.9433448338478301

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

# Define the base models. Each one is seeded so the ensemble — and the
# accuracy printed below — is reproducible across notebook re-runs.
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
# probability=True makes the SVC expose class probabilities, which soft
# voting needs from every base model.
svm_model = SVC(probability=True, random_state=42)

# Create the Voting Classifier
voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('random_forest', random_forest),
    ('svm', svm_model)
], voting='soft')  # 'soft' for probability voting, 'hard' for majority voting

# Train the Voting Classifier
voting_classifier.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out split
accuracy = voting_classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9451607045578355
