In [1]:
import time

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import get_scorer, make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [5]:
# Keep the stop words as a list: scikit-learn's text vectorizers document
# `stop_words` as 'english', a list, or None, and recent releases validate the
# parameter type and reject a set. The vectorizer de-duplicates internally, so
# the resulting features are the same.
stop_words = list(stopwords.words('english'))

In [4]:
# stop_words = [
#     'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'their', 'they',
#     'his', 'her', 'she', 'he', 'a', 'an', 'and', 'is', 'was', 'are', 'were', 'him', 'himself', 'has', 'have',
#     'it', 'its', 'the', 'us'
# ]

In [69]:
# Build the reduced vocabulary from split 1: vectorize train+test reviews with
# TF-IDF, fit an L1-penalised logistic regression on the training rows, and
# keep only the terms whose coefficient is non-zero.
start = time.time()

train_url = 'proj3_data/split_1/train.tsv'
test_url = 'proj3_data/split_1/test.tsv'
test_y_url = 'proj3_data/split_1/test_y.tsv'
train = pd.read_csv(train_url, sep='\t', header=0, dtype=str)
test = pd.read_csv(test_url, sep='\t', header=0, dtype=str)
# Replace every (non-greedy) '&lt;...&gt;' span with a single space.
train['review'] = train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
test['review'] = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
test_y = pd.read_csv(test_y_url, sep='\t', header=0, dtype=str)
test = test.merge(test_y, on='id')

# Stack train on top of test. ignore_index gives a clean 0..N-1 index instead
# of the duplicated labels produced by concatenating the two original indexes.
tot = pd.concat(
    [train[['review', 'sentiment']], test[['review', 'sentiment']]],
    ignore_index=True,
)

vectorizer = TfidfVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),                # Use 1- to 4-grams
    min_df=0.001,                      # Drop terms found in < 0.1% of documents
    max_df=0.5,                        # Drop terms found in > 50% of documents
    token_pattern=r"\b[\w+\|']+\b"     # Use word tokenizer: See Ethan's comment below
)

dtm_tot = vectorizer.fit_transform(tot['review'])
feature_names = vectorizer.get_feature_names_out()

# Training rows come first in `tot`, so slice by the actual training size
# rather than a hard-coded row count.
dtm_train = dtm_tot[:len(train), :]
logreg = LogisticRegression(C=1.01,
                            max_iter=10000,
                            solver='liblinear', penalty='l1').fit(dtm_train, train['sentiment'])

# Terms with a non-zero L1 coefficient form the final vocabulary; each term is
# mapped to its position in the selected list.
selected_features = feature_names[logreg.coef_[0] != 0]
vocabulary_dict = {feature: idx for idx, feature in enumerate(selected_features)}
end = time.time()
end - start

262.00023889541626

In [7]:
# TF-IDF vectorizer used to build the candidate vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),                       # unigrams up to 4-grams
    min_df=0.001,                             # lower document-frequency cut-off
    max_df=0.5,                               # upper document-frequency cut-off
    stop_words=stop_words,                    # drop stop words
    token_pattern=r"\b[\w+\|']+\b",           # word tokenizer: See Ethan's comment below
    preprocessor=lambda text: text.lower(),   # lowercase every document
)

In [8]:
# Learn the vocabulary and IDF weights from all reviews in `tot` and return
# the resulting document-term matrix.
dtm_tot = vectorizer.fit_transform(tot['review'])

In [9]:
# Terms of the fitted vectorizer, used later to map coefficient positions back
# to n-grams.
feature_names = vectorizer.get_feature_names_out()

In [10]:
# Number of terms in the fitted vocabulary.
feature_names.shape

(17549,)

In [11]:
# Show the sparse matrix summary (shape, dtype, stored elements).
dtm_tot

<50000x17549 sparse matrix of type '<class 'numpy.float64'>'
	with 5256129 stored elements in Compressed Sparse Row format>

In [52]:
# Training reviews were concatenated ahead of the test reviews in `tot`, so the
# first len(train) rows are the training documents. Using the actual length
# avoids a hard-coded row count that silently breaks if the split size changes.
dtm_train = dtm_tot[:len(train), :]

In [13]:
# L1-penalised logistic regression fitted on the training rows; the sparsity of
# its coefficients is what the following cells use to pick terms.
logreg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1.01,
    max_iter=10000,
).fit(dtm_train, train['sentiment'])

In [14]:
# How many coefficients are non-zero after the L1-penalised fit.
np.count_nonzero(logreg.coef_)

996

In [15]:
# Keep only the terms whose fitted coefficient is non-zero.
nonzero_mask = logreg.coef_[0] != 0
selected_features = feature_names[nonzero_mask]

In [16]:
# Map each selected term to its position in `selected_features`.
vocabulary_dict = dict(zip(selected_features, range(len(selected_features))))

In [17]:
# Size of the selected vocabulary.
len(vocabulary_dict)

996

In [18]:
# ROC AUC must be computed from continuous scores. make_scorer(roc_auc_score)
# with default arguments scores the estimator's hard predict() labels instead,
# so use scikit-learn's built-in 'roc_auc' scorer, which evaluates
# decision_function / predict_proba output.
auc_scorer = get_scorer('roc_auc')

In [70]:
# Evaluate the saved vocabulary on the five train/test splits. For every split:
# build TF-IDF features restricted to the saved vocabulary, fit a
# cross-validated logistic regression and an XGBoost classifier, blend their
# positive-class probabilities (0.7 / 0.3) and score the blend with ROC AUC.
aucs = []
processing_times = []

# The vocabulary file is identical for every split, so parse it once here
# instead of re-reading it on each iteration.
file_path = 'myvocab.txt'
vocabulary_dict_loaded = {}
with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing lines
        # Each line is "<term>: <index>"; split on the last separator only.
        key, value = line.rsplit(': ', 1)
        vocabulary_dict_loaded[key] = int(value)

start = time.time()
for i in range(1,6):
    train_url = f'proj3_data/split_{i}/train.tsv'
    test_url = f'proj3_data/split_{i}/test.tsv'
    test_y_url = f'proj3_data/split_{i}/test_y.tsv'

    train = pd.read_csv(train_url, sep='\t', header=0, dtype=str)
    test = pd.read_csv(test_url, sep='\t', header=0, dtype=str)
    # Replace every (non-greedy) '&lt;...&gt;' span with a single space.
    train['review'] = train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    test['review'] = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    test_y = pd.read_csv(test_y_url, sep='\t', header=0, dtype=str)
    test = test.merge(test_y, on='id')

    new_vectorizer = TfidfVectorizer(
        vocabulary=vocabulary_dict_loaded,  # Fixed vocabulary loaded from myvocab.txt
        preprocessor=lambda x: x.lower(),   # Convert to lowercase
        stop_words=stop_words,              # Remove stop words
        # Only 1- and 2-grams are generated here.
        # NOTE(review): the vocabulary was selected with ngram_range=(1, 4), so
        # any 3-/4-gram entries can never match — confirm this is intended.
        ngram_range=(1, 2),
        # scikit-learn documents min_df / max_df as ignored when a vocabulary
        # is supplied; they are kept only to mirror the vocabulary-building step.
        min_df=0.001,
        max_df=0.5,
        token_pattern=r"\b[\w+\|']+\b"      # Use word tokenizer: See Ethan's comment below
    )

    # IDF weights are learned on the training reviews only and reused for test.
    dtm_train = new_vectorizer.fit_transform(train['review'])
    dtm_test = new_vectorizer.transform(test['review'])

    # Earlier hyper-parameter search, kept for reference:
#    param_grid = {
#    'learning_rate': [0.2, 0.3,0.4],
#    'n_estimators': [500,700,1000]
#    'learning_rate': [0.01, 0.1, 0.2],
#    'min_child_weight': [1, 3, 5],
#    'subsample': [0.8, 0.9, 1.0],
#    'colsample_bytree': [0.8, 0.9, 1.0],
#    'gamma': [0, 0.1, 0.2]
#    }
#    grid_search = GridSearchCV(XGBClassifier(
#                                            max_depth = 5,
#                                             n_estimators = 1000,
#                                             learning_rate = 0.4,
#                                             use_label_encoder=False, 
#                                             min_child_weight = 5,
#                                             subsample = 0.8,
#                                             colsample_bytree = 1.0,                                            
#                                              eval_metric='logloss', 
#                                              objective='binary:logistic'),
#                            param_grid=param_grid,
#                            cv=5,
#                            scoring=auc_scorer,
#                            n_jobs=-1)
#     grid_search.fit(dtm_train, train['sentiment'].astype(int))
#     best_params = grid_search.best_params_
#     best_model = grid_search.best_estimator_
#     proba_positive_class = best_model.predict_proba(dtm_test)[:,1]

    xgb_clf = XGBClassifier(
                            max_depth = 4,
                            n_estimators = 500,
                            learning_rate = 0.2,
                            use_label_encoder=False,
                            min_child_weight = 6,
                            eval_metric='logloss',
                            objective='binary:logistic').fit(dtm_train, train['sentiment'].astype(int))

    logreg_cv = LogisticRegressionCV(solver='liblinear').fit(dtm_train, train['sentiment'])

    # Column 1 of predict_proba is the probability of the second class.
    proba_positive_class_1 = logreg_cv.predict_proba(dtm_test)[:,1]
    proba_positive_class_2 = xgb_clf.predict_proba(dtm_test)[:,1]
    # Weighted blend: 70% logistic regression, 30% XGBoost.
    auc = roc_auc_score(test['sentiment'], 0.7*proba_positive_class_1+0.3*proba_positive_class_2)
#    auc = roc_auc_score(test['sentiment'], proba_positive_class)
    aucs.append(round(auc,3))

    end = time.time()
    processing_times.append(round(end-start,2))

    print('split_%i is processed'%i)
    print('time eclapsed: %0.2f seconds'%(end - start))
    print('split_%i auc score %0.4f'%(i,auc))
    print('------------------')
    # Restart the clock so the next split is timed on its own.
    start = time.time()

split_1 is processed
time eclapsed: 36.56 seconds
split_1 auc score 0.9601
------------------
split_2 is processed
time eclapsed: 32.73 seconds
split_2 auc score 0.9623
------------------
split_3 is processed
time eclapsed: 35.31 seconds
split_3 auc score 0.9623
------------------
split_4 is processed
time eclapsed: 34.97 seconds
split_4 auc score 0.9628
------------------
split_5 is processed
time eclapsed: 38.76 seconds
split_5 auc score 0.9628
------------------


In [71]:
# Per-split AUCs (rounded to 3 decimals) collected by the evaluation loop.
aucs

[0.96, 0.962, 0.962, 0.963, 0.963]

In [72]:
# Per-split elapsed seconds (rounded to 2 decimals) collected by the evaluation loop.
processing_times

[36.56, 32.73, 35.31, 34.97, 38.76]

In [55]:
file_path = 'myvocab.txt'

# Persist the selected vocabulary, one "<term>: <index>" entry per line.
with open(file_path, 'w') as file:
    file.writelines(
        f'{word}: {index}\n' for word, index in vocabulary_dict.items()
    )

In [56]:
file_path = 'myvocab.txt'

# Read the saved vocabulary back into a {term: index} dict. Each line is
# expected to look like "<term>: <index>".
vocabulary_dict_loaded = {}
with open(file_path, 'r') as file:
    for line in file:
        entry = line.strip()
        if not entry:
            continue  # a blank or trailing line would otherwise fail to unpack
        # Split on the last separator only, so the index is always the final field.
        key, value = entry.rsplit(': ', 1)
        # Indices are stored as text; convert back to int.
        vocabulary_dict_loaded[key] = int(value)

In [58]:
# Number of vocabulary entries read back from disk.
len(vocabulary_dict_loaded)

996

In [64]:
# Collect the blended positive-class probabilities next to the review ids and
# write them to disk.
mypred = pd.DataFrame()
mypred['id'] = test['id']
# Same 0.7 / 0.3 blend of logistic-regression and XGBoost probabilities that
# was scored in the evaluation loop.
mypred['prob'] = 0.7*proba_positive_class_1+0.3*proba_positive_class_2

file_path = 'mypred.csv'
# Write the frame built above; the original referenced an undefined name
# (`test_pred`) here, which raises NameError.
mypred.to_csv(file_path, index=False)

In [65]:
# Display the blended predictions (id, prob).
mypred

Unnamed: 0,id,prob
0,40625,0.950597
1,14191,0.581868
2,5011,0.049003
3,23277,0.891420
4,29766,0.000079
...,...,...
24995,35113,0.107084
24996,20566,0.044944
24997,21602,0.000550
24998,11034,0.640018


In [66]:
# Display the test frame after test_y was merged in on 'id'.
test

Unnamed: 0,id,review,sentiment,score
0,40625,The first and second seasons started off shaki...,1,10
1,14191,"As Americans, we have come to expect crapiness...",0,3
2,5011,"PERHAPS SPOILER !! well, i ve seen it at the f...",0,2
3,23277,This was one of my favorites as a child. My fa...,1,8
4,29766,I feel it is my duty as a lover of horror film...,0,1
...,...,...,...,...
24995,35113,1st watched 8/3/2003 - 2 out of 10(Dir-Brad Sy...,0,2
24996,20566,"\Glen or Glenda\"" was Edward D. Wood Jr's firs...",0,1
24997,21602,I've seen a lot of bad movies in my life. Date...,0,1
24998,11034,Matt Cvetic is a loyal communist in a Pittsbur...,0,4
