# Logistic regression on movie-review sentiment data, without hyperparameter tuning

In [24]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import csv
import re
from pathlib import Path

import pandas as pd
from nltk.stem import WordNetLemmatizer
# Directory holding the dataset TSV files. Edit this one line to point the
# notebook at a different copy of the data.
DATA_DIR = Path(r"C:\Users\usrer\Downloads\word2vec-nlp-tutorial")


def _read_tsv(filename):
    """Load one tab-separated dataset file from DATA_DIR into a DataFrame.

    The first row is used as the header, and quote characters are treated as
    ordinary text (csv.QUOTE_NONE, the named form of the literal 3 used
    previously) so quotes inside the reviews are kept verbatim.
    """
    return pd.read_csv(DATA_DIR / filename, header=0, delimiter="\t",
                       quoting=csv.QUOTE_NONE)


train = _read_tsv("labeledTrainData.tsv")
test = _read_tsv("testData.tsv")
unlabeled_train = _read_tsv("unlabeledTrainData.tsv")
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup             
from nltk.corpus import stopwords
# Cache for the NLTK English stop-word set. It is filled on first use so the
# corpus is loaded once instead of once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_wordlist( raw_review, remove_stopwords=None ):
    """Convert a raw review into a cleaned, lemmatized string of words.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.
    remove_stopwords : bool or None, optional
        Pass False to keep English stop words. None (the default) and True
        both remove them, which matches what this function did before the
        flag was honoured (it used to be ignored and stop words were always
        removed).

    Returns
    -------
    str
        Lower-cased, lemmatized words joined by single spaces. Despite the
        function name, the result is one string, not a list.
    """
    # 1. Strip HTML. The parser is named explicitly so bs4 does not have to
    #    guess one (which triggers a warning and can vary between machines).
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space. Digits are
    #    non-letters too, so no separate digit pass is needed.
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace into individual words.
    words = letters_only.lower().split()

    # 4. Drop stop words unless the caller explicitly asked to keep them.
    if remove_stopwords is None or remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 5. Lemmatize each remaining word with the lemmatizer's default settings.
    lemmatizer = WordNetLemmatizer()

    # 6. Join the words back into one space-separated string.
    return " ".join(lemmatizer.lemmatize(w) for w in words)



In [25]:
# Number of reviews, taken from the size of the "review" column.
num_reviews = train["review"].size

# Clean every training review, in row order, into a list of strings.
clean_train = [
    review_to_wordlist(train['review'][i])
    for i in range(num_reviews)
]

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Fixed seed so the hold-out split, and therefore the metrics printed below,
# are the same on every run. Without it each execution evaluates on a
# different random 20% of the data.
SEED = 42

X = clean_train
y = train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

logreg = LogisticRegression(penalty='l2')
# NOTE: `model` is the TF-IDF vectorizer, not the classifier.
model = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the held-out split.
tfidf_train = model.fit_transform(X_train)
tfidf_test = model.transform(X_test)

logreg.fit(tfidf_train, y_train)
y_pred = logreg.predict(tfidf_test)

# Evaluate on the held-out 20%.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))



[[2131  308]
 [ 270 2291]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2439
           1       0.88      0.89      0.89      2561

   micro avg       0.88      0.88      0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

0.8844


In [None]:
# Display the hyperparameters of the current `logreg` estimator as a dict.
logreg.get_params()

In [None]:
# Import necessary modules
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: penalty type crossed with five values of C.
param_grid = {'penalty': ['l1', 'l2'],
              "C": [0.01, 0.1, 1, 10, 100]}

# liblinear handles both 'l1' and 'l2', so every grid point is valid; the
# lbfgs solver only supports 'l2' and would reject the 'l1' candidates.
logreg = LogisticRegression(solver='liblinear')

# Instantiate the GridSearchCV object: logreg_cv (5-fold CV, scored by accuracy)
logreg_cv = GridSearchCV(logreg, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit on the TF-IDF matrix. X_train is the list of cleaned review strings,
# which LogisticRegression cannot consume directly.
# NOTE: tfidf_train was vectorized on the whole training split before
# cross-validation, so the folds share vocabulary/IDF statistics; wrapping the
# vectorizer and classifier in a Pipeline would avoid that.
logreg_cv.fit(tfidf_train, y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))
