In [15]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from wordcloud import WordCloud

# Load the three dataset splits from CSV files in the working directory.
train = pd.read_csv("train_dataset.csv")
test = pd.read_csv("test_dataset.csv")
vaal = pd.read_csv("val_dataset.csv")  # read from the "val" file

# Classifier shared by the rest of the script; max_iter caps the solver at
# 5000 iterations.
mod = LogisticRegression(max_iter=5000)

# Optional quick look at the raw training data (disabled):
# print("\nMissing values:\n", train.isnull().sum())
# sns.countplot(x='Label', data=train, palette='viridis')
# plt.title('Label distr')
# plt.show()


def common_distr(feature_names, matrix, n=20):
    """Return the ``n`` features with the largest column totals in ``matrix``.

    Each entry of ``feature_names`` labels one column of ``matrix``. A
    feature's score is the sum of that column over all rows, i.e. the total
    weight the feature has across every row of the matrix.

    Args:
        feature_names: Sequence of column labels, indexed like the columns.
        matrix: 2-D object with a ``sum(axis=0)`` method (SciPy sparse
            matrix or array, ``np.matrix`` or ``np.ndarray``).
        n: Maximum number of ``(feature, score)`` pairs to return.

    Returns:
        List of ``(feature, score)`` tuples sorted by descending score.
        Features with equal scores keep their original column order.

    Raises:
        IndexError: If ``feature_names`` has more entries than ``matrix``
            has columns.
    """
    # sum(axis=0) yields a (1, n_cols) row for sparse matrices / np.matrix but
    # a 1-D array for ndarrays and sparse arrays; flatten so both index alike.
    totals = np.asarray(matrix.sum(axis=0)).ravel()
    words_scores = [(name, totals[idx]) for idx, name in enumerate(feature_names)]
    words_scores.sort(key=lambda pair: pair[1], reverse=True)
    return words_scores[:n]

def plot_balance():
    """Print how many rows of the module-level ``train`` frame carry each label.

    Reports the counts for ``Label == 0`` and ``Label == 1`` so the two can
    be compared; nothing is returned and nothing is plotted.
    """
    labels = train["Label"]
    for value in (0, 1):
        # Summing the boolean mask counts the rows equal to ``value``.
        count = int((labels == value).sum())
        print(f"Label {value}: {count}")

def plot_length_distr():
    """Plot overlaid histograms of text length for label 0 and label 1.

    Side effect: stores ``len`` of every ``Text`` value in a new ``Text_len``
    column on the module-level ``train`` frame.
    """
    train["Text_len"] = train["Text"].apply(len)

    plt.figure(figsize=(12, 5))
    # (label value, bar colour, legend entry) for each histogram, drawn in order.
    series_specs = (
        (0, 'red', 'Label 0'),
        (1, 'green', 'Label 1'),
    )
    for label_value, colour, legend_text in series_specs:
        lengths = train.loc[train['Label'] == label_value, 'Text_len']
        sns.histplot(lengths, bins=30, color=colour, label=legend_text, kde=True)

    plt.title("0 vs 1 length distribution")
    plt.ylabel("tweets")
    plt.legend()
    plt.show()
def workcloud(label, color):
    """Show a word cloud built from the processed texts of one label.

    Args:
        label: Value of the ``Label`` column selecting the rows to include.
        color: Colormap name passed to ``WordCloud`` as ``colormap``.

    Reads the ``Text_processed`` column of the module-level ``train`` frame,
    so that column must already exist when this is called.
    """
    selected = train['Label'] == label
    corpus = " ".join(train.loc[selected, 'Text_processed'])

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=color,
    ).generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud for Category {label}")
    plt.show()


def analysis_before_preprocess():
    """Run the exploratory checks on the raw (not yet cleaned) training data.

    Prints the label counts and then shows the text-length distribution plot.
    """
    plot_balance()
    # Must be a call: a bare ``plot_length_distr`` is a no-op expression.
    plot_length_distr()


def preprocess(txt):
    """Lower-case ``txt`` and strip links, @mentions and punctuation.

    The removals run in a fixed order on the lower-cased text: first
    anything starting with ``http`` up to the next whitespace, then
    ``@word`` mentions, then every character that is neither a word
    character nor whitespace. Surrounding whitespace is left as it is.

    Args:
        txt: The raw text as a ``str``.

    Returns:
        The cleaned string.
    """
    removal_patterns = (
        r'http\S+',   # links
        r'@\w+',      # mentions
        r'[^\w\s]',   # punctuation such as ! ? , .
    )
    cleaned = txt.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # TODO: tokenize the words, then apply stemming / lemmatization.
    return cleaned
    
def print_common_words(matrix0, matrix1, feature_names=None):
    """Print the highest-scoring words for label 0 and for label 1.

    Args:
        matrix0: Feature matrix holding only the label-0 rows.
        matrix1: Feature matrix holding only the label-1 rows.
        feature_names: Column labels shared by both matrices. When omitted,
            they are read from a module-level ``vectorizer`` via
            ``get_feature_names_out()``, which therefore has to exist and be
            fitted at call time.

    Both rankings come from ``common_distr`` with its default ``n``.
    """
    if feature_names is None:
        # Original behaviour: rely on the fitted vectorizer in module scope.
        feature_names = vectorizer.get_feature_names_out()

    top_words_0 = common_distr(feature_names, matrix0)
    top_words_1 = common_distr(feature_names, matrix1)

    print("Most common words for Label 0:")
    print(top_words_0)
    print("\nMost common words for Label 1:")
    print(top_words_1)


# Extra stop words added on top of scikit-learn's English list because they
# appear frequently in the data. The contractions are spelled without
# apostrophes, which is the form preprocess() leaves after stripping
# punctuation.
# ENGLISH_STOP_WORDS is the public name for the list; the private
# ``_stop_words`` module is an implementation detail and may move.
default_stopwords = ENGLISH_STOP_WORDS
stopwords = {
    'im', 'ive', 'dont', 'wont', 'cant', 'isnt', 'doesnt',
    'just', 'like', 'got', 'day',
}
custom_stopwords = set(default_stopwords) | stopwords

def process():
    """Clean the texts, fit TF-IDF on the training split and build model inputs.

    Steps:
      1. Run the exploratory analysis on the raw training data.
      2. Add a ``Text_processed`` column (output of ``preprocess``) to the
         module-level ``train`` and ``vaal`` frames.
      3. Fit a ``TfidfVectorizer`` (custom stop words, at most 3000 features)
         on the training texts and transform both splits with it.
      4. Split the training feature matrix by label.

    Returns:
        Tuple ``(vectorizer, Xtrain, Xval, ytrain, yval, matrix0, matrix1)``:
        the fitted vectorizer, the feature matrices (one row per text, one
        column per vocabulary term) for the training and validation splits,
        the matching ``Label`` columns, and the rows of ``Xtrain`` whose
        label is 0 and 1 respectively.
    """
    analysis_before_preprocess()

    train['Text_processed'] = train['Text'].apply(preprocess)
    vaal['Text_processed'] = vaal['Text'].apply(preprocess)

    # Fit on the training texts only; the validation texts are transformed
    # with the vocabulary and weights learned from the training split.
    vectorizer = TfidfVectorizer(stop_words=list(custom_stopwords), max_features=3000)
    Xtrain = vectorizer.fit_transform(train['Text_processed'])
    Xval = vectorizer.transform(vaal['Text_processed'])
    ytrain = train['Label']
    yval = vaal['Label']

    # Per-label views of the training features (boolean row masks).
    matrix0 = Xtrain[(ytrain == 0).to_numpy()]
    matrix1 = Xtrain[(ytrain == 1).to_numpy()]

    return vectorizer, Xtrain, Xval, ytrain, yval, matrix0, matrix1


# Build the features once and publish them at module level: the training
# code further down reads Xtrain / ytrain / Xval / yval as globals, and
# print_common_words() looks up ``vectorizer`` the same way.
vectorizer, Xtrain, Xval, ytrain, yval, matrix0, matrix1 = process()
    


#workcloud(1, 'Greens')
#workcloud(0, 'Blues')


def print_stats(name, y, ypred, show_confusion=False):
    """Print the accuracy of ``ypred`` against ``y`` for one data split.

    Args:
        name: Prefix for the printed lines (e.g. the split's name).
        y: True labels.
        ypred: Predicted labels, aligned with ``y``.
        show_confusion: When true, also print the confusion matrix and show
            it as a heatmap. The heatmap tick labels are written for two
            classes (0 and 1). Off by default.
    """
    accuracy = accuracy_score(y, ypred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    # TODO: precision, recall and F1 reporting is not enabled yet.

    if show_confusion:
        conf_matrix = confusion_matrix(y, ypred)
        print(f"{name} Confusion Matrix:")
        print(conf_matrix)

        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.title("Confusion Matrix Heatmap")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.show()
# Train the classifier. Xtrain, ytrain, Xval and yval are read as
# module-level names and must already be defined when this runs.
mod.fit(Xtrain, ytrain)

# Predict on the data the model was fitted on and on the held-out split.
y_train_pred = mod.predict(Xtrain)
y_val_pred = mod.predict(Xval)

# Report accuracy for both splits.
print_stats("\n\nTrain", ytrain, y_train_pred)
print_stats("Evaluation", yval, y_val_pred)


Most common words for Label 0:
[('work', 1333.3653846161308), ('miss', 1112.1485382882022), ('want', 1036.4553195957162), ('today', 1007.8550636274665), ('sad', 995.4761172544956), ('really', 922.5046922936124), ('going', 914.5355797564155), ('wish', 829.2407574065577), ('know', 796.1591348806766), ('sorry', 759.4864034546129), ('time', 744.1179521057421), ('good', 730.47953932828), ('home', 728.8617696036015), ('oh', 718.1051067115317), ('feel', 717.5006944265702), ('bad', 689.2460326236932), ('need', 688.8217472763345), ('didnt', 654.1969682801148), ('think', 639.0556681055214), ('sleep', 621.1451915134836)]

Most common words for Label 1:
[('good', 1729.2745383590425), ('love', 1435.5672123183656), ('thanks', 1426.0209102112717), ('lol', 1030.6625781265848), ('going', 895.9255892010035), ('time', 868.6139667189424), ('great', 848.0968033762986), ('new', 805.6775873850534), ('today', 791.1205217066893), ('know', 777.2332973029571), ('thank', 738.5427906134265), ('happy', 715.01179937