In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Prepare Data

## Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# Ignore FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

## Read csv files

In [None]:
# Read the train and test CSV files (using latin1 encoding) and stack them into one frame.
d = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
f = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of `d` and `f` are kept as-is, so labels repeat and label-based
# lookups on `df` become ambiguous.
df = pd.concat([d, f], ignore_index=True)

## Explore dataset

In [None]:
# Quick look at the combined frame: its dimensions, the per-column summary, and the first rows.
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
df.info()
df.head()

## Convert Data

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [None]:
df['sentiment'].value_counts(normalize=True).plot(kind='bar');

In [None]:
# Replace each sentiment label with its integer category code.
# NOTE(review): output_lable() further down assumes 0/1/2 mean negative/neutral/positive —
# confirm that the category order produced here matches.
df['sentiment'] = df['sentiment'].astype('category').cat.codes
# Bar chart of the relative frequency of each code.
df['sentiment'].value_counts(normalize=True).plot(kind='bar');

In [None]:
# Encode the Time of Tweet column as integer category codes
df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
# Encode the Country column as integer category codes
df['Country'] = df['Country'].astype('category').cat.codes
# Map each Age of User range label to a single representative integer
df['Age of User']=df['Age of User'].replace({'0-20':18,'21-30':25,'31-45':38,'46-60':53,'60-70':65,'70-100':80})

## Drop unused columns

In [None]:
df.info()

In [None]:
# Remove the columns that are not used for modelling. DataFrame.drop returns a
# new frame and leaves `df` untouched, so the result must be assigned back for
# the drop to take effect.
df = df.drop(
    columns=[
        'textID', 'Time of Tweet', 'Age of User', 'Country',
        'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)',
    ]
)
df.head()

## Create a function to process text

In [None]:
def wp(text):
    """Normalize a piece of text before it is vectorized.

    Steps, in order: lowercase; remove square-bracketed spans; remove URLs;
    remove HTML-style tags; turn every non-word character into a space;
    remove any remaining punctuation (in practice the underscore, which
    counts as a word character); remove every token that contains a digit.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed characters
        can leave runs of spaces behind.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and tags must be removed BEFORE the non-word substitution below:
    # once ':', '/', '.', '<' and '>' have been turned into spaces, these two
    # patterns can no longer match anything.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here and no
    # separate newline removal is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
df['selected_text'] = df["selected_text"].apply(wp)

## Split data

In [None]:
X=df['selected_text']
y= df['sentiment']

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the size of each of the four pieces.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

## Convert text to vectors

In [None]:
# Turn the cleaned text into TF-IDF feature matrices.
vectorization = TfidfVectorizer()
# The vectorizer is fitted on the training text only ...
XV_train = vectorization.fit_transform(X_train)
# ... and the test text is transformed with that already-fitted vectorizer.
XV_test = vectorization.transform(X_test)

## Baseline model

In [None]:
# Baseline: accuracy obtained by always predicting the most frequent class of the
# training labels, measured on the same held-out labels the models are scored on.
# Taking the class frequencies of the whole frame instead would mix training and
# test rows into the reference number.
majority_class = y_train.mode().iloc[0]
score_baseline = (y_test == majority_class).mean()
score_baseline

## Logistic regression

In [None]:
lr = LogisticRegression(n_jobs=-1)
lr.fit(XV_train,y_train)

In [None]:
pred_lr=lr.predict(XV_test)

In [None]:
# get accuracy score
score_lr = accuracy_score(y_test, pred_lr)
score_lr

In [None]:
print(classification_report(y_test, pred_lr))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, pred_lr);

## Decision Tree Classification

In [None]:
# Seed the tree so the fitted model, and therefore its score, is the same on every
# run; the train/test split and the random forest in this notebook are seeded too.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(XV_train, y_train)

In [None]:
pred_dt = dt.predict(XV_test)

In [None]:
# Accuracy of the decision tree on the held-out set, from the predictions made above.
score_dt = accuracy_score(y_test, pred_dt)
score_dt

In [None]:
print(classification_report(y_test, pred_dt))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, pred_dt);

## Random Forest Classifier

In [None]:
rfc = RandomForestClassifier(random_state=0)
rfc.fit(XV_train, y_train)

In [None]:
pred_rfc = rfc.predict(XV_test)

In [None]:
# Accuracy of the random forest on the held-out set, from the predictions made above.
score_rfc = accuracy_score(y_test, pred_rfc)
score_rfc

In [None]:
print(classification_report(y_test, pred_rfc))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, pred_rfc);

## Best model in accuracy 

In [None]:
# Side-by-side accuracy of the baseline and the three fitted models.
print(
    f'Baseline model: {score_baseline}\n'
    f'Logistic regression: {score_lr}\n'
    f'Decision Tree Classification: {score_dt}\n'
    f'Random Forest Classifier: {score_rfc}'
)

# Communicate the result 

In [None]:
def output_lable(n):
    """Translate an integer sentiment code into a human-readable sentence.

    Parameters
    ----------
    n : int
        Sentiment code: 0 (negative), 1 (neutral) or 2 (positive).

    Returns
    -------
    str or None
        The sentence for the code, or None when ``n`` is none of 0, 1, 2.
    """
    labels = ("Negative", "Neutral", "Positive")
    for code, label in enumerate(labels):
        if n == code:
            return f"The Text Sentiment is {label}"
    # Unknown codes keep yielding None, now stated explicitly.
    return None
    
def manual_testing(news):
    """Print the logistic-regression sentiment prediction for one piece of text.

    The text is cleaned with ``wp`` and vectorized with the already-fitted
    ``vectorization`` before being passed to ``lr``.

    Parameters
    ----------
    news : str
        Raw text to classify.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    cleaned = wp(news)
    # transform() takes an iterable of documents, hence the one-element list.
    features = vectorization.transform([cleaned])
    # Only the logistic-regression model is queried: its prediction is the only
    # one this function reports.
    prediction = lr.predict(features)[0]
    print(output_lable(prediction))

In [None]:
# Enter a text to predict its sentiment
text = "Iam Sad"
manual_testing(text)