In [35]:
import pandas as pd
import pickle
import string
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [36]:
eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
wnl = WordNetLemmatizer()

def remove_stopwords(w_list):
     return [word for word in w_list if word not in eng_stopwords]

def remove_punctuation(w_list):
     return [word for word in w_list if word not in punctuation_list]

def remove_number(w_list):
     return [word for word in w_list if word.isalpha()]

def get_pos_tag(tag):
     if tag == 'jj':
          return 'a'
     elif tag in ['nn', 'rb', 'vb']:
          return tag[0]
     else:
          return None

def lemmatizing_words(w_list):
     lemmatized = []
     tagging = pos_tag(w_list)
     for word, tag in tagging:
          label = get_pos_tag(tag)
          if label != None:
               lemmatized.append(wnl.lemmatize(word, label))
          else:
               lemmatized.append(wnl.lemmatize(word))
     return lemmatized

In [37]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = remove_stopwords(words)
    words = remove_punctuation(words)
    words = remove_number(words)
    words = lemmatizing_words(words)
    return ' '.join(words)

In [38]:
def training_model():
     dataset = pd.read_csv('./Twitter_Data.csv').sample(n=1000)

     comments_list = dataset['clean_text'].apply(preprocess_text).tolist()
     label_list = dataset['category'].tolist()

     vectorizer = TfidfVectorizer()
     word_tfidf = vectorizer.fit_transform(comments_list)

     X_train, X_test, y_train, y_test = train_test_split(word_tfidf, label_list, test_size=0.2, random_state=42)
     random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
     random_forest.fit(X_train, y_train)

     y_pred = random_forest.predict(X_test)

     accuracy = accuracy_score(y_test, y_pred)
     report = classification_report(y_test, y_pred)

     print("Accuracy:", accuracy)
     print("Classification Report:\n", report)

     file = open('random_forest.pickle','wb')
     pickle.dump(random_forest, file)
     file.close()

     file = open('tfidf_random_forest.pickle','wb')
     pickle.dump(vectorizer, file)
     file.close()

     return random_forest, vectorizer
     

In [39]:
# try:
#      print("Load model...")
#      file = open('random_forest.pickle', 'rb')
#      model = pickle.load(file)
#      file.close()
#      file = open('tfidf_random_forest.pickle', 'rb')
#      vectorizer = pickle.load(file)
#      file.close()
# except FileNotFoundError:
#      print("No Model...")
model, vectorizer = training_model()

Accuracy: 0.535
Classification Report:
               precision    recall  f1-score   support

        -1.0       0.50      0.06      0.11        47
         0.0       0.57      0.79      0.66        71
         1.0       0.51      0.59      0.54        82

    accuracy                           0.54       200
   macro avg       0.52      0.48      0.44       200
weighted avg       0.53      0.54      0.48       200

