# Text classification workflow

## 1. Loading data

In [None]:

import sys
# Add the parent directory to the import path; the `common` import below relies on it.
sys.path.append("..")
from common.download_utils import download_week1_resources

# Project helper; presumably fetches the data files read later from `data/` — TODO confirm.
download_week1_resources()

## 2. Text preprocessing 

In [None]:
import nltk
import ssl

# Work around environments where the NLTK download server cannot be reached
# with verified HTTPS: if the private helper exists, make the unverified
# context the default one.
# NOTE: this switches off certificate verification for every later request in
# this kernel that uses ssl's default HTTPS context.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The helper is missing on this Python build; leave the default context alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download only the corpus this notebook uses. A bare `nltk.download()` opens
# the interactive downloader, which blocks "Restart Kernel -> Run All".
nltk.download('stopwords')
from nltk.corpus import stopwords

## 3. Getting everything ready for the task

In this task you will deal with a dataset of post titles from StackOverflow. The data is split into 3 sets: train, validation and test. All corpora (except for test) contain the titles of the posts and the corresponding tags (100 tags are available). Load the corpora using pandas and look at the data:

In [None]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [None]:
def read_data(filename):
    """
        filename: path (or file-like object) of a tab-separated file with a 'tags' column

        return: DataFrame whose 'tags' cells are parsed from their Python-literal
                text (e.g. "['a', 'b']") into real Python objects
    """
    frame = pd.read_csv(filename, sep='\t')
    # Each cell holds the textual repr of a list; literal_eval turns it back into a list.
    parsed_tags = frame['tags'].map(literal_eval)
    frame['tags'] = parsed_tags
    return frame

In [None]:
# Train and validation go through read_data, which parses the 'tags' column.
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
# The test file is read directly with pandas, so no 'tags' parsing is applied to it.
test = pd.read_csv('data/test.tsv', sep='\t')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [19]:
# Pull the raw titles (features) and tag lists (labels) out as NumPy arrays.
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
# Only titles are taken from the test frame.
X_test = test['title'].values

### For each tag and for each word calculate how many times they occur in the train corpus.

In [20]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [21]:

import collections

# Whitespace-only tokenizer, kept under its original notebook-level name.
tokenizer1 = nltk.tokenize.WhitespaceTokenizer()

# Dictionary of all tags from train corpus with their counts.
# Each element of y_train is a list of tags; keys appear in first-seen order.
tags_counts = dict(collections.Counter(tag for tags in y_train for tag in tags))

# Dictionary of all words from train corpus with their counts.
# Titles are split on whitespace only, so tokens are case-sensitive and keep
# their punctuation. Keys appear in first-seen order, which decides how ties
# are ordered when the counts are sorted later.
#
# The previous version of this cell first built the same dictionary with
# CountVectorizer. That path called `.toarray()` on the full document-term
# matrix (dense, memory-hungry), used `get_feature_names()` (removed in newer
# scikit-learn), and its result was overwritten by this Counter pass anyway,
# so it has been dropped.
words_counts = dict(
    collections.Counter(
        token
        for title in X_train
        for token in tokenizer1.tokenize(title)
    )
)

In [None]:
#Obtaining the most common words
# Keep the 5000 most frequent training tokens. sorted() is stable, so tokens
# with equal counts keep the order in which they appear in words_counts.
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:5000]
# Show a short preview only; printing all 5000 pairs floods the output.
print(most_common_words[:10])

# word -> column index (rank by frequency) and the inverse mapping.
most_commonN = {word: index for index, (word, _) in enumerate(most_common_words)}
indextowords = {index: word for word, index in most_commonN.items()}

# Fixed: this previously referenced the undefined name `most_common`,
# which raised NameError.
WORDS_TO_INDEX = most_commonN

print(len(WORDS_TO_INDEX))

## 4. Prepare the text data

One of the best-known difficulties when working with natural-language data is that it is unstructured. For example, if you use it "as is" and extract tokens just by splitting the titles on whitespace, you will see that there are many "weird" tokens like `3.5?`, `"Flip`, etc. To prevent such problems, it is usually useful to prepare the data somehow. In this task you'll write a function which will also be used in the other assignments.



In [23]:
import re

In [24]:
#Input necessary for the bag of words:
# Number of columns in the bag-of-words vectors.
DICT_SIZE = 5000
# word -> column index, built in the vocabulary cell above.
WORDS_TO_INDEX = most_commonN
# column index -> word (inverse of WORDS_TO_INDEX).
INDEX_TO_WORDS = indextowords
# Live view over the vocabulary words (dict keys view, not a copy).
ALL_WORDS = WORDS_TO_INDEX.keys()

In [25]:
# Raw strings avoid the invalid-escape warning the non-raw literal triggered.
# Separator-like symbols that are turned into a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
# Built once at definition time instead of on every call: a whole-word match of
# any English stopword plus the whitespace that follows it. The alternation
# keeps the corpus order of the stopword list.
STOPWORDS_RE = re.compile(
    r'\b(' + r'|'.join(re.escape(word) for word in stopwords.words('english')) + r')\b\s*'
)
# Runs of spaces left behind by the substitutions above.
MULTI_SPACE_RE = re.compile(' +')

def text_prepare(text):
    """
        text: a string

        return: modified initial string -- lowercased, separator symbols
                replaced by spaces, other disallowed symbols removed, English
                stopwords removed, runs of spaces collapsed, and leading /
                trailing spaces stripped
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(" ", text)
    text = BAD_SYMBOLS_RE.sub("", text)
    text = STOPWORDS_RE.sub('', text)
    text = MULTI_SPACE_RE.sub(' ', text)
    # Removing a stopword at either end used to leave a dangling space.
    return text.strip()

In [26]:
# Normalise every title of the three splits with text_prepare so that the
# titles no longer contain bad symbols. Each array is replaced by a plain list.
# NOTE(review): WORDS_TO_INDEX was built from the raw titles, before this
# normalisation runs, so its keys may not match the prepared tokens
# (case, punctuation, stopwords) -- confirm this ordering is intended.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))



### Multilabel classifier

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

In [29]:
# One indicator column per tag seen in the training corpus, in sorted tag order.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
# The validation labels are only transformed with the binarizer fitted on the
# training labels; they must not refit it.
y_val = mlb.transform(y_val)

## 4 Transforming text to a vector

Machine Learning algorithms work with numeric data and we cannot use the provided text data "as is". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.

#### Bag of words

One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:
1. Find the *N* most popular words in the train corpus and enumerate them. Now we have a dictionary of the most popular words.
2. For each title in the corpora create a zero vector with dimension equal to *N*.
3. For each text in the corpora iterate over the words which are in the dictionary and increase the corresponding coordinate by 1.

Let's try to do it for a toy example. Imagine that we have *N* = 4 and the list of the most popular words is 

    ['hi', 'you', 'me', 'are']

Then we need to enumerate them, for example, like this: 

    {'hi': 0, 'you': 1, 'me': 2, 'are': 3}

And we have the text, which we want to transform to the vector:

    'hi how are you'

For this text we create a corresponding zero vector 

    [0, 0, 0, 0]
    
And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:

    'hi':  [1, 0, 0, 0]
    'how': [1, 0, 0, 0] # word 'how' is not in our dictionary
    'are': [1, 0, 0, 1]
    'you': [1, 1, 0, 1]

The resulting vector will be 

    [1, 1, 0, 1]
   
Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equal to 5000. To find the most common words use the train data. You can test your code using the function *test_my_bag_of_words*.

In [30]:
#Function my_bag_of_words

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        words_to_index: dict mapping a word to its position in the vector
        dict_size: size of the dictionary (length of the returned vector)

        return a vector which is a bag-of-words representation of 'text':
        a list of dict_size ints where position words_to_index[w] holds the
        number of times w occurs among the whitespace-separated tokens of text
    """
    result_vector = [0] * dict_size
    # One pass over the tokens with a dictionary lookup each, instead of
    # scanning the whole vocabulary for every token.
    for word in text.split():
        index = words_to_index.get(word)
        # Unknown words and indices that fall outside the vector are skipped.
        if index is not None and 0 <= index < dict_size:
            result_vector[index] += 1
    return result_vector

In [None]:
#Example:
# Toy vocabulary and sentence taken from the walkthrough above.
words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
examples = ['hi how are you']
answers = [[1, 1, 0, 1]]

# Print the function output next to the expected vector for a visual comparison
# (nothing is asserted here).
for ex, ans in zip(examples, answers):
    print("ex is "+str(ex)+" ans is "+str(ans))
    print("my bag of words output is"+str(my_bag_of_words(ex, words_to_index, 4)))
    print("answer is....."+str(ans))

In [32]:
#Sparse matrix
from scipy import sparse as sp_sparse

In [None]:
# Encode every title as a one-row sparse matrix and stack the rows, giving one
# sparse matrix per split.
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
# Sanity check on the resulting shapes.
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

### TF-IDF 

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [35]:
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — samples
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Tokens are maximal runs of non-whitespace characters, so symbols such as
    # '#' or '+' stay attached to their word. The raw string avoids the
    # invalid-escape warning that '\S+' triggers as a plain literal.
    # min_df / max_df are fractions of documents: terms found in fewer than 5%
    # or more than 95% of the training titles are dropped. Unigrams and
    # bigrams are extracted.
    tfidf_vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.95, ngram_range=(1, 2), token_pattern=r'\S+')

    # Fit on the training split only; the other splits reuse its vocabulary and IDF weights.
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)

    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [36]:
#Defining the training
def train_classifier(X_train, y_train, penalty='l2', C=100.0):
    """
        X_train, y_train — training features and binary label-indicator matrix
        penalty — regularisation type handed to LogisticRegression ('l2' by default)
        C — inverse regularisation strength (100.0 by default)

        return: fitted OneVsRestClassifier wrapping one logistic regression per label
    """
    # The defaults reproduce the previously hard-coded model, so existing
    # two-argument calls behave as before.
    # An L1 penalty needs a solver that supports it; for anything else the
    # library's default solver is left untouched.
    solver_kwargs = {'solver': 'liblinear'} if penalty == 'l1' else {}
    ovr2 = OneVsRestClassifier(LogisticRegression(penalty=penalty, C=C, **solver_kwargs))
    ovr2.fit(X_train, y_train)
    return ovr2
    
  

In [37]:
def print_evaluation_scores(y_val, predicted):
    """
        y_val: true label-indicator matrix
        predicted: predicted label-indicator matrix

        Prints, one value per line: accuracy, macro F1, micro F1, weighted F1
        (prefixed with "f1 is "), then the macro, micro and weighted
        average-precision scores.
    """
    print(accuracy_score(y_val, predicted))

    for averaging in ('macro', 'micro'):
        print(f1_score(y_val, predicted, average=averaging))
    weighted_f1 = f1_score(y_val, predicted, average='weighted')
    print("f1 is "+str(weighted_f1))

    for averaging in ('macro', 'micro', 'weighted'):
        print(average_precision_score(y_val, predicted, average=averaging))

## Performing the Training

### 1. Doing the transformation with TFIDF:

In [38]:
# TF-IDF matrices for the three splits plus the term -> column vocabulary.
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
# Inverse mapping: column index -> term.
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

### 2. Training 

In [None]:
# One classifier per feature representation, both trained on the same labels.
classifier_mybag = train_classifier(X_train_mybag, y_train)
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

### 3. Predicting

In [40]:
# Hard label predictions and raw decision scores on the validation split, bag-of-words model.
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

# Same for the TF-IDF model.
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [None]:
# Validation metrics for both representations, computed on the hard label predictions.
print('Bag-of-words')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

## 5.  Workflow for the entire model

In [42]:
def ModelPredictionAccuracy(X_train,X_val,X_test,X_train_mybag,X_val_mybag,X_test_mybag,
                            y_train,y_val,penaltyListed='l2',CListed=50,min_dfListed=0.1,max_dfListed=0.9,
                            ngram_rangeListed=(1, 2)):
    """
        Fit one TF-IDF and one bag-of-words logistic-regression model for a
        single hyperparameter combination and score both on the validation split.

        X_train, X_val: prepared titles, vectorised here with TF-IDF
        X_test, X_test_mybag: accepted for interface compatibility; not used
        X_train_mybag, X_val_mybag: precomputed bag-of-words matrices
        y_train, y_val: label-indicator matrices
        penaltyListed, CListed: LogisticRegression penalty type and C
        min_dfListed, max_dfListed, ngram_rangeListed: TfidfVectorizer settings

        return: DataFrame with six rows (Accuracy, F1, Precision for TFIDF and
                for BOW) together with the hyperparameters used

        NOTE(review): the 'Precision' rows hold average_precision_score computed
        on hard label predictions, not a plain precision -- confirm that is the
        intended metric.
    """
    print("penalty is "+str(penaltyListed))
    print("C is "+str(CListed))
    print("max_df is "+str(max_dfListed))
    print("min_df is "+str(min_dfListed))
    print("ngram_range is "+str(ngram_rangeListed))

    # First, obtain the TF-IDF features again for this setting. The raw string
    # avoids the invalid-escape warning of the plain '\S+' literal.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_dfListed, max_df=max_dfListed, ngram_range=ngram_rangeListed,
                                       token_pattern=r'\S+')
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)

    # An L1 penalty needs a solver that supports it; for other penalties the
    # library's default solver is kept.
    solver_kwargs = {'solver': 'liblinear'} if penaltyListed == 'l1' else {}

    def _fit_and_score(features_train, features_val):
        # Second, fit the one-vs-rest logistic regression; third, predict labels;
        # fourth, return [accuracy, weighted F1, weighted average precision].
        model = OneVsRestClassifier(
            LogisticRegression(penalty=penaltyListed, C=CListed, max_iter=500, **solver_kwargs))
        model.fit(features_train, y_train)
        predicted_labels = model.predict(features_val)
        return [accuracy_score(y_val, predicted_labels),
                f1_score(y_val, predicted_labels, average='weighted'),
                average_precision_score(y_val, predicted_labels, average='weighted')]

    tfidf_values = _fit_and_score(X_train_tfidf, X_val_tfidf)
    mybag_values = _fit_and_score(X_train_mybag, X_val_mybag)

    n_rows = 6
    data = {'Measure': ['Accuracy', 'F1', 'Precision'] * 2,
            'Value': tfidf_values + mybag_values,
            'C value': [CListed] * n_rows,
            'Penalty': [penaltyListed] * n_rows,
            'min_df': [min_dfListed] * n_rows,
            'max_df': [max_dfListed] * n_rows,
            'ngram_range': [ngram_rangeListed] * n_rows,
            'Model': ['TFIDF'] * 3 + ['BOW'] * 3}

    df = pd.DataFrame(data)
    print(df)
    return df
    


In [43]:
#Doing the lists
# Candidate values for the hyperparameter grid explored in the loop below.

Penalty_List=['l2','l1']
# min_df / max_df are floats, i.e. fractions of documents for TfidfVectorizer.
min_df_List=[0.02,0.05,0.1]
max_df_List=[0.6,0.7,0.8,0.9,1.0]
ngram_range_List=[(1, 2),(1,3)]
# Inverse regularisation strengths for LogisticRegression.
C_List=[1,2,5,10,20,100]

In [None]:
#Doing the loop
import itertools

# Evaluate every combination of the grid. product() varies its rightmost
# argument fastest, so the combinations come in the same order as the former
# nested loops (C outermost, ngram_range innermost).
result_frames = []
for CL, penaltyL, min_dfL, max_dfL, ngram_rangeL in itertools.product(
        C_List, Penalty_List, min_df_List, max_df_List, ngram_range_List):
    Results = ModelPredictionAccuracy(X_train, X_val, X_test, X_train_mybag, X_val_mybag, X_test_mybag,
                                      y_train, y_val, penaltyListed=penaltyL, CListed=CL,
                                      min_dfListed=min_dfL, max_dfListed=max_dfL,
                                      ngram_rangeListed=ngram_rangeL)
    result_frames.append(Results)

# Concatenate once at the end: DataFrame.append copied the growing frame on
# every iteration and no longer exists in pandas 2.x. As before, each
# per-run frame keeps its own 0..5 index.
RESULTS2 = pd.concat(result_frames)
                    

In [None]:
#First, obtain again the tfidf
# These notebook-level settings are left exactly as they were, because the
# cell after the RESULTS2 printout reads them.
penaltyListed='l2'
CListed=1
min_dfListed=0.1
max_dfListed=0.6
ngram_rangeListed=(1, 2)

# Configuration used by the models in THIS cell. The printout and the result
# table previously showed the settings above, although the models were built
# with the values below; they now report what was actually run.
run_penalty = 'l2'
run_C = 50
run_min_df = 0.02
run_max_df = 0.8
run_ngram_range = (1, 3)

print("penalty is "+str(run_penalty))
print("C is "+str(run_C))
print("max_df is "+str(run_max_df))
print("min_df is "+str(run_min_df))
print("ngram_range is "+str(run_ngram_range))

# Raw string avoids the invalid-escape warning of the plain '\S+' literal.
tfidf_vectorizer = TfidfVectorizer(min_df=run_min_df, max_df=run_max_df, ngram_range=run_ngram_range,
                                   token_pattern=r'\S+')

X_train_tfidf=tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf=tfidf_vectorizer.transform(X_val)
X_test_tfidf=tfidf_vectorizer.transform(X_test)

#Second, perform logistic regression
Logistic_TFIDF = OneVsRestClassifier(LogisticRegression(penalty=run_penalty, C=run_C,max_iter=500 ))
Logistic_MYBAG = OneVsRestClassifier(LogisticRegression(penalty=run_penalty, C=run_C,max_iter=500 ))

Logistic_TFIDF.fit(X_train_tfidf,y_train)
Logistic_MYBAG.fit(X_train_mybag,y_train)

#Third, obtain predictions
y_val_predicted_labels_tfidf = Logistic_TFIDF.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = Logistic_TFIDF.decision_function(X_val_tfidf)

y_val_predicted_labels_mybag = Logistic_MYBAG.predict(X_val_mybag)
y_val_predicted_scores_mybag = Logistic_MYBAG.decision_function(X_val_mybag)

#Fourth, measuring accuracy, f1, etc. of predictions
accuracy_tfidf=accuracy_score(y_val, y_val_predicted_labels_tfidf)
f1_tfidf=f1_score(y_val, y_val_predicted_labels_tfidf, average='weighted')
precision_tfidf=average_precision_score(y_val, y_val_predicted_labels_tfidf, average='weighted')

accuracy_mybag=accuracy_score(y_val, y_val_predicted_labels_mybag)
f1_mybag=f1_score(y_val, y_val_predicted_labels_mybag, average='weighted')
precision_mybag=average_precision_score(y_val, y_val_predicted_labels_mybag, average='weighted')

# Six rows: three metrics for the TF-IDF model followed by three for bag-of-words.
data={'Measure':['Accuracy','F1','Precision']*2,
      'Value':[accuracy_tfidf,f1_tfidf,precision_tfidf,accuracy_mybag,f1_mybag,precision_mybag],
      'C value':[run_C]*6,
      'Penalty':[run_penalty]*6,
      'min_df':[run_min_df]*6,
      'max_df':[run_max_df]*6,
      'ngram_range':[run_ngram_range]*6,
      'Model':['TFIDF']*3+['BOW']*3}

df = pd.DataFrame(data)
print(df)

In [None]:
# Plain-text dump of the accumulated grid-search results.
print(RESULTS2)

In [None]:
# Re-run one configuration using the notebook-level settings (penaltyListed,
# CListed, min_dfListed, max_dfListed, ngram_rangeListed) assigned in an
# earlier cell. The former single-iteration `for c in [1]:` wrapper added
# nothing and has been removed.

# Raw string avoids the invalid-escape warning of the plain '\S+' literal.
tfidf_vectorizer = TfidfVectorizer(min_df=min_dfListed, max_df=max_dfListed, ngram_range=ngram_rangeListed,
                                   token_pattern=r'\S+')

X_train_tfidf=tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf=tfidf_vectorizer.transform(X_val)
X_test_tfidf=tfidf_vectorizer.transform(X_test)

#Second, perform logistic regression
# An L1 penalty needs a solver that supports it; otherwise the default solver is kept.
solver_kwargs = {'solver': 'liblinear'} if penaltyListed == 'l1' else {}
Logistic_TFIDF = OneVsRestClassifier(LogisticRegression(penalty=penaltyListed, C=CListed,max_iter=500, **solver_kwargs))
Logistic_MYBAG = OneVsRestClassifier(LogisticRegression(penalty=penaltyListed, C=CListed,max_iter=500, **solver_kwargs))

Logistic_TFIDF.fit(X_train_tfidf,y_train)
Logistic_MYBAG.fit(X_train_mybag,y_train)

#Third, obtain predictions
y_val_predicted_labels_tfidf = Logistic_TFIDF.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = Logistic_TFIDF.decision_function(X_val_tfidf)

y_val_predicted_labels_mybag = Logistic_MYBAG.predict(X_val_mybag)
y_val_predicted_scores_mybag = Logistic_MYBAG.decision_function(X_val_mybag)

#Fourth, measuring accuracy, f1, etc. of predictions
accuracy_tfidf=accuracy_score(y_val, y_val_predicted_labels_tfidf)
f1_tfidf=f1_score(y_val, y_val_predicted_labels_tfidf, average='weighted')
precision_tfidf=average_precision_score(y_val, y_val_predicted_labels_tfidf, average='weighted')

accuracy_mybag=accuracy_score(y_val, y_val_predicted_labels_mybag)
f1_mybag=f1_score(y_val, y_val_predicted_labels_mybag, average='weighted')
precision_mybag=average_precision_score(y_val, y_val_predicted_labels_mybag, average='weighted')

# Six rows: three metrics for the TF-IDF model followed by three for bag-of-words.
data={'Measure':['Accuracy','F1','Precision']*2,
      'Value':[accuracy_tfidf,f1_tfidf,precision_tfidf,accuracy_mybag,f1_mybag,precision_mybag],
      'C value':[CListed]*6,
      'Penalty':[penaltyListed]*6,
      'min_df':[min_dfListed]*6,
      'max_df':[max_dfListed]*6,
      'ngram_range':[ngram_rangeListed]*6,
      'Model':['TFIDF']*3+['BOW']*3}

# The table was built but never shown; display it like the sibling cell does.
df = pd.DataFrame(data)
print(df)

In [None]:
# Sweep penalty type and C for the bag-of-words model. This replaces twelve
# copy-pasted blocks that all printed the label 'l1 c=50' whatever
# configuration they had actually trained; the label now states the real
# penalty and C. Run order is unchanged: every L1 setting first, then every L2
# setting, so the notebook-level names end up holding the l2 / C=100 run as before.
for sweep_penalty in ['l1', 'l2']:
    for sweep_C in [1, 5, 10, 20, 50, 100]:
        # An L1 penalty needs a solver that supports it; for L2 the default
        # solver is kept.
        solver_kwargs = {'solver': 'liblinear'} if sweep_penalty == 'l1' else {}
        Logistic_MYBAG = OneVsRestClassifier(
            LogisticRegression(penalty=sweep_penalty, C=sweep_C, max_iter=500, **solver_kwargs))
        Logistic_MYBAG.fit(X_train_mybag, y_train)
        y_val_predicted_labels_mybag = Logistic_MYBAG.predict(X_val_mybag)

        #Fourth, measuring accuracy, f1, etc. of predictions
        accuracy_mybag = accuracy_score(y_val, y_val_predicted_labels_mybag)
        f1_mybag = f1_score(y_val, y_val_predicted_labels_mybag, average='weighted')
        precision_mybag = average_precision_score(y_val, y_val_predicted_labels_mybag, average='weighted')

        print('-----')
        print(sweep_penalty + ' c=' + str(sweep_C))
        print(accuracy_mybag)
        print(f1_mybag)
        print(precision_mybag)