In [1]:
%cd drive/MyDrive/Thesis\ Datasets
import csv
import string
import warnings
warnings.filterwarnings('ignore')

/content/drive/MyDrive/Thesis Datasets


In [2]:
!pip install pysentiment2

Collecting pysentiment2
  Downloading pysentiment2-0.1.1-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 5.1 MB/s 
Installing collected packages: pysentiment2
Successfully installed pysentiment2-0.1.1


In [None]:
def create_sentence_data(dataset, step):
  names = []
  positions = []
  ethnicity = []
  total = 0
  ids = []
  id = 0
  reports = []
  header = ["ID", "Position", "Scouting Report Sentence(s)", "Race"]
  with open(dataset) as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)
    for row in csvreader:
      id +=1
      sent = row[2].split(".")
      for i in range(0, len(sent)-1, step):
        ids.append(id)
        names.append(row[0])
        positions.append(row[1])
        sentence = str(sent[i:i+step]).translate(str.maketrans('', '', '[]\''))
        reports.append(sentence)
        ethnicity.append(row[3])
  with open(str(step)+"_sentence_data_2021.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for i in range(0, len(names)):
      data = [ids[i], positions[i], reports[i], ethnicity[i]]
      writer.writerow(data)
        
create_sentence_data("dataset_2021_weka.csv", 1)

In [None]:
create_sentence_data("dataset_2021_weka.csv", 2)      
create_sentence_data("dataset_2021_weka.csv", 5)

In [39]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.pipeline import Pipeline
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imb_Pipeline
from numpy import mean
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
def get_accuracy_oversampling(dataset):

  reports = []
  eth = []
  ps = PorterStemmer()
  stop_words = set(stopwords.words('english'))
  with open(dataset) as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)
    for row in csvreader:
      # some preproccessing
      # removing stopwords
      word_tokens = word_tokenize(row[2])
      filtered = [w for w in word_tokens if not w.lower() in stop_words]
      filtered = []
      for w in word_tokens:
        if w not in stop_words:
          filtered.append(w)
      rep = " "
      rep = rep.join(filtered)

      # removing punctuation
      rep.translate(str.maketrans('', '', string.punctuation))
      # stemming
      reports.append(ps.stem(rep))
      eth.append(row[3])


  rep_train, rep_test, y_train, y_test = train_test_split(reports, eth, test_size = 0.25)

  vectorizer = CountVectorizer()
  vectorizer.fit(rep_train)
  X_train = vectorizer.transform(rep_train)
  X_test = vectorizer.transform(rep_test)
  
  # LR
  steps = [('over', RandomOverSampler()), ('model', LogisticRegression())]
  pipeline = imb_Pipeline(steps=steps)
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
  scores = cross_val_score(pipeline, X_test, y_test, cv=cv)
  score = mean(scores)
  print(f"LR accuracy: {round(score, 3)}.")

  # NB
  steps = [('over', RandomOverSampler()), ('model', MultinomialNB())]
  pipeline = imb_Pipeline(steps=steps)
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
  scores = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring='f1')
  score = mean(scores)
  print(f"NB accuracy: {round(score, 3)}.")

  # SVM
  steps = [('over', RandomOverSampler()), ('model', svm.SVC())]
  pipeline = imb_Pipeline(steps=steps)
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
  scores = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring='f1')
  score = mean(scores)
  print(f"SVM accuracy: {round(score, 3)}.")

  # SGD
  steps = [('over', RandomOverSampler()), ('model', SGDClassifier())]
  pipeline = imb_Pipeline(steps=steps)
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
  scores = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring='f1')
  score = mean(scores)
  print(f"SGD accuracy: {round(score, 3)}.")

  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
print("1 sentence accuracy...")
get_accuracy_oversampling("1_sentence_data_2021.csv")
print("\n2 sentence accuracy...")
get_accuracy_oversampling("2_sentence_data_2021.csv")
print("\n5 sentence accuracy...")
get_accuracy_oversampling("5_sentence_data_2021.csv")
print("\nAll sentence accuracy...")
get_accuracy_oversampling("dataset_2021_weka.csv")

1 sentence accuracy...
LR accuracy: 0.735.
NB accuracy: 0.669.
SVM accuracy: 0.82.
SGD accuracy: 0.741.

2 sentence accuracy...
LR accuracy: 0.745.
NB accuracy: 0.728.
SVM accuracy: 0.802.
SGD accuracy: 0.733.

5 sentence accuracy...
LR accuracy: 0.796.
NB accuracy: 0.792.
SVM accuracy: 0.791.
SGD accuracy: 0.77.

All sentence accuracy...
LR accuracy: 0.812.
NB accuracy: 0.831.
SVM accuracy: 0.831.
SGD accuracy: 0.84.


In [12]:
# read in GI as key-value pairs with keys being preprocessed (famili, confid, unnecessari, etc)
import csv
import pysentiment2 as ps
import re

def read_GI(filename):
  kvp = {}
  with open(filename ,'r', ) as file:
    csvreader = csv.reader(file)
    next(csvreader)
    next(csvreader)
    for col in csvreader:
      if col[1] != "Lvd": 
        kvp[col[0]] = [str(word) for word in col[2:-2] if word]
  
  kvp2 = {}
  hiv4 = ps.HIV4()   
  for key in kvp:
    new_key = str(hiv4.tokenize(key))
    # print(new_key)
    #print(bool(new_key))
    res = re.findall(r'\w+', new_key)
    # print(res)
    if res: 
      #print(res[0])
      kvp2[res[0]] = kvp[key].copy()
    else:
      kvp2[new_key] = kvp[key].copy()
  return kvp2

key_val = read_GI("inquirerbasic_py.csv")

In [23]:
# creating new csv's with GI sentiment for sentence(s)
def create_GI_sentiment_csv(dataset, gi_val):
  stop = 0
  tok_data = []
  names = []
  positions = []
  ethnicity = []
  total = 0
  ids = []
  id = 1
  header = ["ID", "Position", "Scouting Report Sentence(s)", "Race"]
  with open(dataset) as csv_file:
    csvreader = csv.reader(csv_file, delimiter=',')
    hiv4 = ps.HIV4()
    next(csvreader)
    for row in csvreader:
      tok_sent = []
      ids.append(row[0])
      names.append(row[0])
      positions.append(row[1])
      ethnicity.append(row[3])

      text = row[2]
      tokens = hiv4.tokenize(text)
      for tok in tokens:
        for words in gi_val:
          if tok == words:
            #print(f"{tok} senti is: {gi_val[words]}.")
            tok_sent.append(gi_val[words])
      x = [' '.join(y) for y in tok_sent]
      row_string = "".join(x)
      tok_data.append(row_string)
    id+=1
  #print(tok_data)
  with open("gi_"+str(dataset), "w") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for i in range(0, len(names)):
      #print(ids[i], positions[i], tok_data[i], ethnicity[i])
      data = [ids[i], positions[i], tok_data[i], ethnicity[i]]
      writer.writerow(data)




In [46]:
print(key_val["confid"])

['Positiv', 'Pstv', 'Strong', 'Power', 'Pleasur', 'EMOT', 'Ovrst', 'WlbPsyc', 'WlbTot']


In [24]:
create_GI_sentiment_csv("1_sentence_data_2021.csv", key_val)
create_GI_sentiment_csv("2_sentence_data_2021.csv", key_val)
create_GI_sentiment_csv("5_sentence_data_2021.csv", key_val)
create_GI_sentiment_csv("dataset_2021_weka.csv", key_val)

In [44]:
print("1 sentence accuracy for GI tagged reports...")
get_accuracy_oversampling("gi_1_sentence_data_2021.csv")
print("\n2 sentence accuracy for GI tagged reports...")
get_accuracy_oversampling("gi_2_sentence_data_2021.csv")
print("\n5 sentence accuracy for GI tagged reports...")
get_accuracy_oversampling("gi_5_sentence_data_2021.csv")
print("\nAll sentence accuracy for GI tagged reports...")
get_accuracy_oversampling("gi_dataset_2021_weka.csv")

1 sentence accuracy for GI tagged reports...
LR accuracy: 0.704.
NB accuracy: 0.661.
SVM accuracy: 0.784.
SGD accuracy: 0.698.

2 sentence accuracy for GI tagged reports...
LR accuracy: 0.679.
NB accuracy: 0.673.
SVM accuracy: 0.75.
SGD accuracy: 0.687.

5 sentence accuracy for GI tagged reports...
LR accuracy: 0.742.
NB accuracy: 0.802.
SVM accuracy: 0.781.
SGD accuracy: 0.777.

All sentence accuracy for GI tagged reports...
LR accuracy: 0.794.
NB accuracy: 0.762.
SVM accuracy: 0.675.
SGD accuracy: 0.721.
