In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import copy
import wordcloud

import nltk.downloader
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Numeric codes substituted for the string labels of the `sympathy` column.
params = dict(negative=-1, neutral=0, positive=1)

# Make sure the NLTK English stop-word corpus is available locally.
nltk.download('stopwords')

# Master switch for the seed-search loop below: the loop body only runs
# while this is truthy, so leaving it False skips the experiment entirely.
loop = False

# Seed handed to the dataframe shuffle; the loop bumps it by one per pass.
DATASHUFFLE_RANDOMSTATE = 0

# Best score seen so far. Starting at 0.7 means only runs that beat 0.7 are
# announced as a new high score.
training_score = 0.7

# Running sum of scores, score of the most recent pass, and running mean.
# (A sum of squares / standard deviation was tracked here once but is
# currently disabled.)
total_score = new_training_score = average_score = 0

def _load_labelled_tweets(csv_path, label_map):
    """Load the tweet CSV and return a frame with `text` and numeric `sympathy`.

    csv_path:  path of the CSV file to read.
    label_map: dict mapping the string labels to their numeric codes.
    """
    frame = pd.read_csv(csv_path)
    # The raw file names the label column `sympathy?`.
    frame = frame.rename(columns={"sympathy?": "sympathy"})
    frame = frame[["text", "sympathy"]].copy()
    # Assign the result back instead of calling replace(..., inplace=True) on
    # `frame['sympathy']`: the chained inplace form operates on a temporary
    # Series and does not update the frame under pandas Copy-on-Write.
    frame["sympathy"] = frame["sympathy"].replace(label_map)
    return frame


def _stem_tweets(texts):
    """Lower-case, tokenize, stop-word-filter and Porter-stem each tweet.

    texts: iterable of tweet strings.
    Returns a list with one space-joined string of stems per tweet, in the
    same order as the input.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    # Presumably the leftover of HTML-escaped ampersands (`&amp;`) -- TODO confirm.
    stop_words.add("amp")
    stemmer = PorterStemmer()

    stemmed = []
    for text in texts:
        # Keep tokens that are not stop words, not purely numeric, and at
        # least two characters long.
        kept = [
            word
            for word in tokenizer.tokenize(text.lower())
            if word not in stop_words and not word.isdigit() and len(word) >= 2
        ]
        stemmed.append(' '.join(stemmer.stem(word) for word in kept))
    return stemmed


if loop:
    # Loading and text cleaning do not depend on the shuffle seed, so do them
    # once up front instead of on every pass of the loop.
    base_df = _load_labelled_tweets('Climate_Sentiments_Twitter.csv', params)
    stemmed_by_row = pd.Series(_stem_tweets(base_df["text"]), index=base_df.index)

# Seed search: reshuffle the rows with an increasing seed, retrain, and report
# every seed whose held-out accuracy beats the best seen so far. `loop` is
# never cleared inside the body, so once enabled this runs until interrupted.
while loop:

    print(f"Average score: {average_score:f} (+{new_training_score:f}) (Iter: {DATASHUFFLE_RANDOMSTATE:d})")

    # Shuffle the rows for this seed and pick the pre-stemmed text in the
    # same shuffled order. One progress dot is printed per stage.
    shuffled = base_df.sample(frac=1, random_state=DATASHUFFLE_RANDOMSTATE)
    stemmedtext = stemmed_by_row.loc[shuffled.index].tolist()
    df = shuffled.reset_index(drop=True)
    target = df["sympathy"]
    print(".", end="")

    # Vectorize the text with TF-IDF: `vectorizedtfid` is the feature matrix
    # and `target` the label vector.
    # NOTE(review): the vectorizer is fitted on all rows before the split, so
    # the test rows influence the vocabulary and IDF weights. Kept as-is so
    # the reported scores stay comparable; consider fitting on the training
    # rows only.
    tfid = TfidfVectorizer()
    vectorizedtfid = tfid.fit_transform(stemmedtext)
    print(".", end="")

    # The split seed is fixed; the variation between passes comes from the
    # row shuffle above.
    x_train, x_test, y_train, y_test = train_test_split(
        vectorizedtfid, target, test_size=0.30, shuffle=True, random_state=0
    )
    print(".", end="")

    # Only the lbfgs score is tracked, so only that solver is fitted.
    # Despite its name, `new_training_score` is the accuracy on the test split.
    logisticRegr = LogisticRegression(C=1000000, solver='lbfgs', max_iter=1000000)
    logisticRegr.fit(x_train, y_train)
    new_training_score = logisticRegr.score(x_test, y_test)
    print(".", end="")

    if new_training_score > training_score:
        training_score = new_training_score
        print("\n===========[HIGH SCORE]===============")
        print(f"{DATASHUFFLE_RANDOMSTATE:d}: {training_score:f}")
        print("======================================", end="")

    DATASHUFFLE_RANDOMSTATE += 1
    total_score += new_training_score

    # The seed starts at 0 and has just been incremented, so it equals the
    # number of completed passes.
    average_score = total_score / DATASHUFFLE_RANDOMSTATE
    print()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
