In [1]:
import pandas as pd
data = pd.read_csv('fb_sentiment.csv')

In [2]:
#some data analysis
columns = data.columns
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
print(data['Label'].unique(), data['Label'].isna().sum(), data['FBPost'].isna().sum())
print(data.shape)
#no missing values

['O' 'N' 'P'] 0 0
(1000, 2)


In [4]:
def encode(label):
  if label == 'N':
    return -1
  if label == 'P':
    return 1
  return 0
data['target'] = data['Label'].apply(encode)

In [5]:
#first, let's try Tfidf vectorizer with some linear classifiers as the standart approach
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['FBPost'], data['target'], test_size=0.2)
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [6]:
train_vectors.shape, test_vectors.shape

((800, 2569), (200, 2569))

In [7]:
#let's build different boosters and LogisticRegression now and compare them
#As our dataset is really small (as well as number of words for vectorizer)
#we can search for parameters even with iterating through each pair of lr, n_est
from sklearn.metrics import accuracy_score
def score_model(model):
  model.fit(train_vectors, y_train + 1)
  predictions = model.predict(test_vectors)
  return accuracy_score(predictions, y_test + 1)

In [8]:
import lightgbm
scores_lgb = []
for lr in range(1, 40):
  scores_row = []
  for n_estimators in range(1, 40, 1):
    model_lgb = lightgbm.LGBMClassifier(
        learning_rate=lr*0.02, 
        n_estimators=n_estimators
    )
    scores_row.append(score_model(model_lgb))
  scores_lgb.append(scores_row)

In [9]:
scores_lgbm_df = pd.DataFrame(
  scores_lgb, 
  columns=list(range(1, 40, 1)), 
  index=list(range(1, 40))
)

In [10]:
import xgboost
scores_xgb = []
for lr in range(1, 40):
  scores_row = []
  for n_estimators in range(1, 40, 1):
    model_xgb = xgboost.XGBClassifier(
        learning_rate=lr*0.02, 
        n_estimators=n_estimators
    )
    scores_row.append(score_model(model_xgb))
  scores_xgb.append(scores_row)

In [11]:
scores_xgboost_df = pd.DataFrame(
  scores_xgb, 
  columns=list(range(1, 40, 1)), 
  index=list(range(1, 40))
)


In [12]:
from sklearn.linear_model import LogisticRegression
print(score_model(LogisticRegression()))

0.775


In [13]:
#now choose optimal parameters for each model
import numpy as np
optimal_lgb = scores_lgbm_df.max()
optimal_xgb = scores_xgboost_df.max()
print(f"logreg score={score_model(LogisticRegression())}")
print(f"Score for LGBMCLassifier is {max(optimal_lgb.values)}")
print(f"Score for XGBCLassifier is {max(optimal_xgb.values)}")

logreg score=0.775
Score for LGBMCLassifier is 0.815
Score for XGBCLassifier is 0.815


In [14]:
#it seems that for these classifiers performance isn't bad at all 
#now we'll get values for lr, n_est
max_est_lgb = optimal_lgb.idxmax()
max_lr_lgb = scores_lgbm_df[max_est_lgb].idxmax()
max_est_xgb = optimal_xgb.idxmax()
max_lr_xgb = scores_xgboost_df[max_est_xgb].idxmax()

In [15]:
print(max_est_lgb, max_lr_lgb*0.02)
print(max_est_xgb, max_lr_xgb*0.02)

28 0.18
33 0.4


In [16]:
#now we'll do some preprocessing and try again this approach
#score is lower with stemmer, won't use it, although it can be useful in other cases
#from nltk.stem import PorterStemmer

In [17]:
#stemmer = PorterStemmer()
data['FBPost'] = data['FBPost'].str.replace('[^A-z]', ' ', regex=True)
data['FBPost'] = data['FBPost'].str.replace('[ ]+', ' ', regex=True)

In [18]:
#data['FBPost'] = pd.Series(
#    [' '.join([stemmer.stem(word) for word in text.split()]) for text in data['FBPost'].values]
#)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(data['FBPost'], data['target'], test_size=0.2)
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [20]:
scores_lgb = []
for lr in range(1, 40):
  scores_row = []
  for n_estimators in range(1, 40, 1):
    model_lgb = lightgbm.LGBMClassifier(
        learning_rate=lr*0.02, 
        n_estimators=n_estimators
    )
    scores_row.append(score_model(model_lgb))
  scores_lgb.append(scores_row)
scores_lgbm_df = pd.DataFrame(
  scores_lgb, 
  columns=list(range(1, 40, 1)), 
  index=list(range(1, 40))
)

In [21]:
scores_xgb = []
for lr in range(1, 40):
  scores_row = []
  for n_estimators in range(1, 40, 1):
    model_xgb = xgboost.XGBClassifier(
        learning_rate=lr*0.02, 
        n_estimators=n_estimators
    )
    scores_row.append(score_model(model_xgb))
  scores_xgb.append(scores_row)
scores_xgboost_df = pd.DataFrame(
  scores_xgb, 
  columns=list(range(1, 40, 1)), 
  index=list(range(1, 40))
)


In [22]:
print(score_model(LogisticRegression()))
optimal_lgb = scores_lgbm_df.max()
optimal_xgb = scores_xgboost_df.max()
print(f"logreg score={score_model(LogisticRegression())}")
print(f"Score for LGBMCLassifier is {max(optimal_lgb.values)}")
print(f"Score for XGBCLassifier is {max(optimal_xgb.values)}")
max_est_lgb = optimal_lgb.idxmax()
max_lr_lgb = scores_lgbm_df[max_est_lgb].idxmax()
max_est_xgb = optimal_xgb.idxmax()
max_lr_xgb = scores_xgboost_df[max_est_xgb].idxmax()
print(max_est_lgb, max_lr_lgb*0.02)
print(max_est_xgb, max_lr_xgb*0.02)

0.69
logreg score=0.69
Score for LGBMCLassifier is 0.755
Score for XGBCLassifier is 0.82
7 0.5
11 0.56


In [23]:
#overall performance is the same, but now xgboost is better

In [24]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(data['FBPost'], data['target'], test_size=0.2)
vectorizer_reg = TfidfVectorizer()
train_vectors_reg = vectorizer_reg.fit_transform(X_train_reg)
test_vectors_reg = vectorizer_reg.transform(X_test_reg)
def score_regressor(model):
  model.fit(train_vectors_reg, y_train_reg)
  preds = model.predict(test_vectors_reg)
  return np.linalg.norm(preds - np.array(y_test_reg.values))

In [25]:
scores_xgb = []
for lr in range(1, 40):
  scores_row = []
  for n_estimators in range(5, 100, 5):
    model_xgb = xgboost.XGBRegressor(
        learning_rate=lr*0.02, 
        n_estimators=n_estimators
    )
    scores_row.append(score_regressor(model_xgb))
  scores_xgb.append(scores_row)


In [26]:
scores_xgboost_df = pd.DataFrame(
  scores_xgb, 
  columns=list(range(5, 100, 5)), 
  index=list(range(1, 40))
)

In [27]:
optimal_xgb_reg = scores_xgboost_df.min()
print(f"Score for XGBCLassifier is {min(optimal_xgb_reg.values)}")
min_est_xgb = optimal_xgb_reg.idxmin()
min_lr_xgb = scores_xgboost_df[min_est_xgb].idxmin()
print(min_est_xgb, min_lr_xgb*0.02)

Score for XGBCLassifier is 6.78627151318999
90 0.16


In [28]:
#next, we'll try to build a classifier for regressor

In [29]:
final_regressor = xgboost.XGBRegressor(learning_rate=0.52, n_estimators=90)
final_vectorizer = TfidfVectorizer()
vectors = final_vectorizer.fit_transform(data['FBPost'])
final_regressor.fit(vectors, data['target'])

In [30]:
new_train = final_regressor.predict(vectors)
extra_classifier = xgboost.XGBClassifier()
X_train_extra, X_test_extra, y_train_extra, y_test_extra = train_test_split(new_train, data['target'], test_size=0.2)
extra_classifier.fit(X_train_extra.reshape((-1,1)), y_train_extra + 1)
preds = extra_classifier.predict(X_test_extra.reshape((-1,1)))
print(accuracy_score(preds, y_test_extra + 1))
#not bad at all
#let's try this for train_test_split on whole dataset, as this model seems to be overfitted

0.995


In [31]:
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(data['FBPost'], data['target'], test_size=0.2)
vectorizer_final = TfidfVectorizer()
train_vectors_final = vectorizer_reg.fit_transform(X_train_final)
test_vectors_final = vectorizer_reg.transform(X_test_final)

In [32]:
final_regressor = xgboost.XGBRegressor(learning_rate=0.52, n_estimators=90)
final_regressor.fit(train_vectors_final, y_train_final)
new_train_final = final_regressor.predict(train_vectors_final)
extra_classifier = xgboost.XGBClassifier(learning_rate=0.1, n_estimators=100)
extra_classifier.fit(new_train_final.reshape((-1,1)), y_train_final + 1)
new_test = final_regressor.predict(test_vectors_final)
preds = extra_classifier.predict(new_test.reshape((-1,1)))
print(accuracy_score(preds, y_test_final + 1))
#test performance not that good


0.745


In [33]:
#we also can try to find a, b tresholds
def apply_tresholds(val):
  if val <= a / 100:
    return -1
  if val <= b / 100:
    return 0
  return 1
scores_ab = []
for a in range(-100, 100, 1):
  row = []
  for b in range(-100, 100, 1):
    preds_categorical = np.array([apply_tresholds(val) for val in new_test])
    row.append(accuracy_score(preds_categorical, y_test_final))
  scores_ab.append(row)
scores_ab_df = pd.DataFrame(
  scores_ab, 
  columns=list(range(-100, 100, 1)),
  index=list(range(-100, 100, 1))
)
print(max(scores_ab_df.max()))
#this is still worse than classifier

0.765
