# ALL LIBRARY IMPORTS USED

In [1]:
##BASIC PREPROCESS RELATED LIBRARIES

import pandas as pd

import numpy as np

import os

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn import preprocessing

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

import io

import random
random.seed(123)


##WEB SCRAPING AND REGULAR EXPRESSIONS

import urllib.request as url

from bs4 import BeautifulSoup as bs

import re

import requests

##NLP LIBRARIES - NLTK USED IN BUILDING NLP PIPE LINE

import nltk
nltk.download('stopwords')

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer

####### MODEL BUILDING MODULES ########

## Logistic regression

from sklearn.linear_model import LogisticRegression

##DECISION TREE

from sklearn.tree import DecisionTreeClassifier, export_graphviz, DecisionTreeRegressor

##CLASSIFICATION METRICS/REGRESSION METRICS

from sklearn.metrics import accuracy_score,mean_absolute_error

from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.metrics import accuracy_score, f1_score


##RANDOM FOREST

from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

#XGBOOST
#Convert data frames to matrix to process the data(Fast convergence)

import xgboost as xgb

#SVM
from sklearn.svm import SVR,SVC,LinearSVC

##SEARCHES AND TUNING

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

#KNN
from sklearn.neighbors import KNeighborsClassifier


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# ***WEB SCRAPPING - EXTRACTING 10-K DOCUMENTS FROM US SECURITIES AND EXCHANGE COMISSION***

## ***COLLECTING EDGAR LINKS TO SCRAPE 10-K DOCUMENTS FROM TRAIN AND TEST CSV FILES PROVIDED***

In [4]:
##TRAIN CSV FILE

from google.colab import files
uploaded = files.upload()

train_links = pd.read_csv(io.BytesIO(uploaded['10k_filing_info_train_links-1570100710859.csv']))

Saving 10k_filing_info_train_links-1570100710859.csv to 10k_filing_info_train_links-1570100710859.csv


In [6]:
##TEST CSV FILE

from google.colab import files
uploaded = files.upload()

test_links = pd.read_csv(io.BytesIO(uploaded['10k_filing_info_train_links-1570100710859.csv']))

Saving 10k_filing_info_test_links-1570843502716.csv to 10k_filing_info_test_links-1570843502716.csv


## **FUNCTION THAT SCRAPES 10-K DOCUMENTS FROM THE GIVEN LINK**

In [0]:
def scrape_10k_document(link, FormType):
    
        
    html = url.urlopen(link).read()
    try:
        soup = bs(html, 'html.parser')
    except:
        soup = bs(html, 'lxml')
    
    
    for tag in soup.type.select('script'):
          tag.decompose()
    
    for tag in soup.type.select('style'):
          tag.decompose()
    
    
    type_tags = soup.findAll(re.compile('^type'))
  
    for tag in type_tags:
        text = tag.text
        if text.startswith(FormType):
            return text


## ***FUNCTION TO CALL SCRAPING FUNCTION AND RETURNS DATAFRAME OF SCRAPED CSV LINKS***

In [0]:
def scrape_function_caller(csv_links,FormType):
    
    data = {'ticker':[], 'cik':[], 'filing_date':[], '10k_link':[], 'scrapped_text':[],'long_term_outlook':[]} ## DICTIONERY THAT COLLECTS DATA FROM 10-K CSV FILES
    for rownum, row in csv_links.iterrows():

		print(row['10k_link'])
		print(rownum)

		ticker = row['ticker']
		cik = row['cik']
		filing_date = row['filing_date']
		DocLink = row['10k_link']
		outlook = row['long_term_outlook']
		text_data = get10kLink(DocLink,FormType)

		if text_data != "":


			data['ticker'].append(ticker)
			data['cik'].append(cik)
			data['filing_date'].append(filing_date)
			data['10k_link'].append(DocLink)
			data['description'].append(text_data)
			data['long_term_outlook'].append(outlook)

        elif text_data == "":

			
			data['ticker'].append(ticker)
			data['cik'].append(cik)
			data['filing_date'].append(filing_date)
			data['10k_link'].append(DocLink)
			data['description'].append("TO_BE_CHECKED")
			data['long_term_outlook'].append(outlook)
			
    return pd.DataFrame.from_dict(data)


		
	
      

## ***CODE BLOCK FOR CALLING SCRAPE_FUNCTION_CALLER***

In [0]:
FormType = '10-K'

data_scraped = scrape_function_caller(train_links,FormType)
test_scraped = scrape_function_caller(test_links,FormType)


##MOUNTING THE GOOGLE DRIVE AND STORING THE SCRAPPED FILES INTO IT

from google.colab import drive
drive.mount('/content/drive')

data_scraped.to_csv('data.csv')
!cp data.csv drive/My\ Drive/

test_scraped.to_csv('test.csv')
!cp test.csv drive/My\ Drive/

## ***IMPORTING CONTRACTION MAP FOR BETTER PROCESS TEXT***

In [7]:
from google.colab import files

src = list(files.upload().values())[0]
open('contractions.py','wb').write(src)
from contractions import CONTRACTION_MAP
import unicodedata

Saving contractions.py to contractions.py


Contractions are shortened version of words or syllables. They often exist in either written or spoken forms in the English language. These shortened versions or contractions of words are created by removing specific letters and sounds. In case of English contractions, they are often created by removing one of the vowels from the word. Examples would be, do not to don’t and I would to I’d. Converting each contraction to its expanded, original form helps with text standardization.

We leverage a standard set of contractions available in the contractions.py file

# ***TEXT CLEANING AND PROCESSING***

## ***TEXT CLEANING : REMOVAL OF SCRUB WORDS,ACCENTED CHARACTERS AND EXPANDING CONTRACTIONS***

In [0]:
##TEXT CLEANING
##SCRUB WORD FUNCTION REMOVES HTML MARK UPS,SPECIAL CHARACTERS,NON ASCIII CHARACTERS,NON BREAKING SPACES ETC

def scrub_words(outtext):
    # Replace \xao characters in text -
    # \xa0 is actually non-breaking space in Latin1 (ISO 8859-1), also chr(160).
    outtext = re.sub('\xa0', ' ', outtext)

    # Replace non ascii and digits
    outtext = re.sub("(\\W|\\d)", ' ', outtext)

    # Replace new line characters and following text untill space
    outtext = re.sub('\n(\w*?)[\s]', '', outtext)

    # Remove html markup
    outtext = re.sub("<.*?>", ' ', outtext)
    
    #outtext = re.sub('\n',' ',outtext)
    #outtext = re.sub(r'\\xa0',' ',outtext)
    #outtext = re.sub(r'\xa0',' ',outtext)
    #outtext = re.sub(r'\\n',' ',outtext)
    outtext = re.sub(r'_',' ',outtext)
    #outtext = re.sub(r'  ',' ',outtext)
    #outtext = re.sub('[^a-zA-z\s]','',outtext)
    outtext = re.sub(' +', ' ',outtext)
    " ".join(outtext.strip())


    return outtext
  

##USING CONTRACTION MAP, EXPAND CONTRACTIONS FUNCTION EXTRACTS THE CONTRACTION FROM THE GIVEN WORD

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    # Regular Expression Objects
    # If you want to use the same regular expression more than once,
    # you should compile it into a regular expression object.
    # Regular expression objects are more efficient, and make your code more readable.
    # To create one, just call re.compile(regex) or re.compile(regex, flags).
    # The flags are the matching options described above for the re.search() and re.match() functions.

    # The regular expression object returned by re.compile() provides all the functions that
    # the re module also provides directly: search(), match(), findall(), finditer(), sub() and split().
    # The difference is that they use the pattern stored in the regex object, and do not take the regex
    # as the first parameter. re.compile(regex).search(subject) is equivalent to re.search(regex, subject).
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        expanded_contraction = expanded_contraction[0:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)

    return expanded_text

def remove_accented_chars(text):
    """Remove non-ASCII characters from list of tokenized words"""
    text = unicodedata.normalize('NFKD', text).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def text_cleaning(scrapped_raw_text):
    
    text_scrub_word_removal = scrub_words(scrapped_raw_text)
    text_expand_contarction_removal = expand_contractions(text_scrub_word_removal)
    cleaned_text = remove_accented_chars(text_expand_contarction_removal)
    
    return cleaned_text
  
  

## ***NLP PIPE LINE BUILDING (TOKENIZATION,CONVERING TO LOWER CASE,LEMMATIZATION)***

In [0]:
def text_processing(text):
  ##STOP WORDS REMOVAL
  
  ##tokens = [token for token in word_tokenize(text) if not token in stop_words]
  
  ##REMOVING ONE/TWO LETTER WORDS
  
  
  tokens = [token for token in word_tokenize(text) if len(token)>2]
  
  ##CONVERTING TO LOWER CASE
  
  tokens = [token.lower() for token in tokens]
  
  ##LEMMATIZING WORDS
  
  tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
  
  ##Returning in a string format
  
  text = ' '.join(tokens)
  
  return text
  
  

## ***APPLY TEXT CLEANING AND TEXT PROCESSING ON RAW DATA***

In [0]:
##TRAIN DATA

data_scraped['scrapped_text'] = data['description'].apply(text_cleaning)
data_scraped['scrapped_text'] = data['scrapped_text'].apply(text_processing)


##TEST DATA

test_scrapped['scrapped_text'] = test_scrapped['description'].apply(text_cleaning)
test_scrapped['scrapped_text'] = test_scrapped['scrapped_text'].apply(text_processing)


##MOUNTING THE GOOGLE DRIVE AND STORING THE CLEAN SCRAPPED FILES INTO IT

from google.colab import drive
drive.mount('/content/drive')

data_scraped.to_csv('data_13_clean.csv')
!cp data.csv drive/My\ Drive/

test_scraped.to_csv('test_13_clean.csv')
!cp test.csv drive/My\ Drive/


## ***LOADING TEXT PROCESSED TRAIN DATA***

In [10]:
from google.colab import drive
drive.mount('/content/drive')

data_13_clean = pd.read_csv('drive/My Drive/data_13_clean.csv')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## ***LOADING TEXT PROCESSED TEST DATA***

In [0]:
from google.colab import drive
drive.mount('/content/drive')

test_13_clean = pd.read_csv('drive/My Drive/test_13_clean_updated.csv')



## ***STOP WORDS***

In [13]:
##STOP WORDS : THESE ARE TAKEN FROM UNIVERSITY OF NOTRE DAME PAPER ON MCDONALD FINANCE DICTIONERY 

_stopwords = ['ME', 'MY', 'MYSELF', 'WE', 'OUR', 'OURS', 'OURSELVES', 'YOU', 'YOUR', 'YOURS',
                       'YOURSELF', 'YOURSELVES', 'HE', 'HIM', 'HIS', 'HIMSELF', 'SHE', 'HER', 'HERS', 'HERSELF',
                       'IT', 'ITS', 'ITSELF', 'THEY', 'THEM', 'THEIR', 'THEIRS', 'THEMSELVES', 'WHAT', 'WHICH',
                       'WHO', 'WHOM', 'THIS', 'THAT', 'THESE', 'THOSE', 'AM', 'IS', 'ARE', 'WAS', 'WERE', 'BE',
                       'BEEN', 'BEING', 'HAVE', 'HAS', 'HAD', 'HAVING', 'DO', 'DOES', 'DID', 'DOING', 'AN',
                       'THE', 'AND', 'BUT', 'IF', 'OR', 'BECAUSE', 'AS', 'UNTIL', 'WHILE', 'OF', 'AT', 'BY',
                       'FOR', 'WITH', 'ABOUT', 'BETWEEN', 'INTO', 'THROUGH', 'DURING', 'BEFORE',
                       'AFTER', 'ABOVE', 'BELOW', 'TO', 'FROM', 'UP', 'DOWN', 'IN', 'OUT', 'ON', 'OFF', 'OVER',
                       'UNDER', 'AGAIN', 'FURTHER', 'THEN', 'ONCE', 'HERE', 'THERE', 'WHEN', 'WHERE', 'WHY',
                       'HOW', 'ALL', 'ANY', 'BOTH', 'EACH', 'FEW', 'MORE', 'MOST', 'OTHER', 'SOME', 'SUCH',
                       'NO', 'NOR', 'NOT', 'ONLY', 'OWN', 'SAME', 'SO', 'THAN', 'TOO', 'VERY', 'CAN',
                       'JUST', 'SHOULD', 'NOW']

from google.colab import files
uploaded = files.upload()

import io


stop_words_1 = pd.read_csv(io.BytesIO(uploaded['Financial_stopwords.csv']))

a = stop_words_1['Stop_Words'].tolist()

print(len(_stopwords),'  ',len(a))

s = ' '.join(_stopwords)
_stopwords = s.lower().split()
_stopwords.extend(a)



Saving Financial_stopwords.csv to Financial_stopwords.csv
120    571


## ***LOADING TIM LOUGHRAN MCDONALD DICTIONERY WHICH IS VOCABULARY OUR CORPUS***

In [16]:
from google.colab import files
uploaded = files.upload()

LA_MCD = pd.read_csv(io.BytesIO(uploaded['LA_MCD.csv']))



Saving LA_MCD.csv to LA_MCD.csv


In [0]:
def convertToInt(cell):
    try:
        return int(cell)
    except:
        return None


In [0]:
def NO_STOP(word):
  
    if word.lower() in _stopwords:
      return None
    else:
      return word.lower()

In [0]:
LA_MCD['Sequence Number'] = LA_MCD['Sequence Number'].apply(convertToInt)

In [0]:
LA_MCD.dropna(inplace=True)

In [0]:
LA_MCD['Sequence Number'] = LA_MCD['Sequence Number'].astype('int64')

In [0]:
LA_MCD['Word'] = LA_MCD['Word'].apply(NO_STOP)

In [0]:
LA_MCD.dropna(inplace=True)

In [0]:
LA_MCD.reset_index(inplace=True)

In [25]:
LA_MCD.index

RangeIndex(start=0, stop=84747, step=1)

In [0]:
LA_MCD['NEW'] = LA_MCD.index

## **VOCABULARY BUILDING FROM LOUGHREN AND MCDONALD DICTIONERY**

In [27]:
##BUILDING DICTIONERY OF VOCABULARY OF MCDONALDS WORDS
##HERE KEY IS WORD AND VALUE IS SEQUENCE NUMBER OF WORD 

%%time

v={}
for i,j in LA_MCD.iterrows():
    
    v.update({j['Word'].lower():j['NEW']})

CPU times: user 8.36 s, sys: 38.4 ms, total: 8.4 s
Wall time: 8.4 s


## ***TRAIN,VALIDATION SPLITS***

In [0]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(data_13_clean['scrapped_text'],data_13_clean['long_term_outlook'], test_size=0.25, random_state=1234)

# ***BUILDING TFIDF MATRICES FOR TRAIN, VALIDATION AND TEST***

## ***HERE TFIDG IS BUILT ON THE CORPUS FOR THE WORDS TAKEN FROM VOCABULARY OF MCD DICTIONERY(STOPWORDS REMOVED FROM VOCABBULARY)***

In [29]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                   min_df=0.2,
                                   use_idf=True, ngram_range=(1, 4),stop_words = _stopwords,vocabulary = v)


CPU times: user 205 µs, sys: 0 ns, total: 205 µs
Wall time: 208 µs


In [30]:

%time tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train)

CPU times: user 2min 9s, sys: 7.52 s, total: 2min 17s
Wall time: 2min 17s


## ***top_tfidf_feats FUNCTION RETURN TOP N TFIDF VALUES IN ROW AND RETURN THEM WITH THEIR CORRESPONDING FEATURE NAMES***

In [0]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [0]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
                                                  
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [33]:
top_feats_in_doc(tfidf_matrix_train,tfidf_vectorizer.get_feature_names(),10,5)

Unnamed: 0,feature,tfidf
0,million,0.262485
1,girl,0.252091
2,product,0.223856
3,toy,0.220674
4,fisher,0.210615


In [34]:
%%time

tfidf_matrix_val = tfidf_vectorizer.transform(X_test)

tfidf_matrix_test = tfidf_vectorizer.transform(test_13_clean['scrapped_text'])

CPU times: user 1min 36s, sys: 3.66 s, total: 1min 39s
Wall time: 1min 39s


In [35]:
print('TFIDF TRAIN : {0},TFIDF VAL : {1},TFIDF TEST : {2}'.format(tfidf_matrix_train.shape,tfidf_matrix_val.shape,tfidf_matrix_test.shape))

TFIDF TRAIN : (1926, 84747),TFIDF VAL : (642, 84747),TFIDF TEST : (856, 84747)


## ***USING SMOTE FOR OVER SAMPLING DATA AS WE HAVE CLASS IMBALANCE IN THE DATA PROVIDED***

In [0]:
from imblearn.over_sampling import SMOTE
smt = SMOTE()
tfidf_matrix_train, y_train = smt.fit_sample(tfidf_matrix_train, y_train)

In [37]:
y_train.shape

(2742,)

In [38]:
tfidf_matrix_train.shape

(2742, 84747)

In [39]:
print("balance class\n {}".format(pd.Series(y_train).value_counts()))

balance class
 1    1371
0    1371
dtype: int64


# **MODEL BUILDING**

## ***LOGISTIC REGRESSION FOR MULTIPLE C VALUES***

In [40]:
%%time

for c in [0.2, 0.5, 1,1.2,1.5,1.8,2,2.3,2.6,2.9,3,4,4.5]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=0.2: 0.512584241617439
0.6386138613861385    0.38655462184873945
f1_score for C=0.5: 0.5042471042471042
0.657142857142857    0.3513513513513513
f1_score for C=1: 0.49999455367354717
0.6651053864168618    0.3348837209302325
f1_score for C=1.2: 0.5117173664137857
0.6782810685249709    0.34515366430260047
f1_score for C=1.5: 0.5175844668714349
0.6813953488372093    0.3537735849056604
f1_score for C=1.8: 0.5128929745659274
0.679814385150812    0.3459715639810427
f1_score for C=2: 0.5199524966736675
0.6844547563805103    0.35545023696682465
f1_score for C=2.3: 0.518730096194885
0.6806526806526807    0.3568075117370892
f1_score for C=2.6: 0.521105748135147
0.6837209302325581    0.3584905660377358
f1_score for C=2.9: 0.5234349106348011
0.6845168800931315    0.3623529411764706
f1_score for C=3: 0.524560104279675
0.6837806301050176    0.3653395784543325
f1_score for C=4: 0.5376771007215088
0.6982658959537573    0.37708830548926014
f1_score for C=4.5: 0.5400828126426195
0.70126874

In [41]:
%%time

for c in [4.5,4.8,5,5.2,5.4,5.6,5.8,6]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=4.5: 0.5400828126426195
0.7012687427912342    0.3788968824940048
f1_score for C=4.8: 0.5448422545196738
0.7050691244239632    0.3846153846153846
f1_score for C=5: 0.5448422545196738
0.7050691244239632    0.3846153846153846
f1_score for C=5.2: 0.5448422545196738
0.7050691244239632    0.3846153846153846
f1_score for C=5.4: 0.5412863346331088
0.7027649769585254    0.37980769230769235
f1_score for C=5.6: 0.5424903295575859
0.7042577675489068    0.380722891566265
f1_score for C=5.8: 0.5436948192570381
0.7057471264367815    0.3816425120772947
f1_score for C=6: 0.5461053709806716
0.7087155963302753    0.38349514563106796
CPU times: user 2min 43s, sys: 12min 55s, total: 15min 38s
Wall time: 24.1 s


In [42]:
%%time

for c in [6.2,6.4,6.8,7,7.2,7.4,7.6,7.8,8]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=6.2: 0.5449349779538984
0.7093821510297482    0.38048780487804873
f1_score for C=6.4: 0.5449349779538984
0.7093821510297482    0.38048780487804873
f1_score for C=6.8: 0.5437329119321744
0.7079037800687284    0.3795620437956204
f1_score for C=7: 0.5425313975238265
0.7064220183486238    0.3786407766990292
f1_score for C=7.2: 0.5389574240669813
0.7041284403669724    0.3737864077669903
f1_score for C=7.4: 0.5401299350324837
0.7034482758620689    0.37681159420289856
f1_score for C=7.6: 0.5389574240669813
0.7041284403669724    0.3737864077669903
f1_score for C=7.8: 0.5389574240669813
0.7041284403669724    0.3737864077669903
f1_score for C=8: 0.537760999435677
0.7026406429391505    0.37288135593220345
CPU times: user 3min 7s, sys: 14min 56s, total: 18min 3s
Wall time: 27.8 s


In [43]:
%%time

for c in [8.2,8.5,8.8,9,9.3,9.6,9.9,10.2,10.5,11,11.2,11.4,11.6,11.8,12,12.3,12.6,12.9]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=8.2: 0.537760999435677
0.7026406429391505    0.37288135593220345
f1_score for C=8.5: 0.5353834506101363
0.7018348623853211    0.3689320388349515
f1_score for C=8.8: 0.5365757811389537
0.7033218785796105    0.36982968369829683
f1_score for C=9: 0.5317862382116662
0.7017142857142858    0.36185819070904646
f1_score for C=9.3: 0.5317862382116662
0.7017142857142858    0.36185819070904646
f1_score for C=9.6: 0.5353411806950414
0.7061503416856493    0.36453201970443355
f1_score for C=9.9: 0.5341021602160216
0.7068181818181819    0.3613861386138614
f1_score for C=10.2: 0.5352844585022096
0.7082860385925086    0.36228287841191065
f1_score for C=10.5: 0.5376507767952712
0.7112117780294451    0.3640897755610973
f1_score for C=11: 0.5364673232477071
0.7097505668934241    0.36318407960198995
f1_score for C=11.2: 0.5376507767952712
0.7112117780294451    0.3640897755610973
f1_score for C=11.4: 0.5340245083779792
0.7089467723669309    0.35910224438902744
f1_score for C=11.6: 0.534024508

In [44]:
%%time

for c in [16.4,16.6,16.8,17,17.2,17.4,17.6]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=16.4: 0.5327471220423942
0.7096045197740114    0.355889724310777
f1_score for C=16.6: 0.5327471220423942
0.7096045197740114    0.355889724310777
f1_score for C=16.8: 0.5315723981900453
0.7081447963800906    0.3550000000000001
f1_score for C=17: 0.5315723981900453
0.7081447963800906    0.3550000000000001
f1_score for C=17.2: 0.5303982399606872
0.7066817667044168    0.3541147132169577
f1_score for C=17.4: 0.5303982399606872
0.7066817667044168    0.3541147132169577
f1_score for C=17.6: 0.5328459742105797
0.7074829931972788    0.3582089552238806
CPU times: user 3min 3s, sys: 14min 36s, total: 17min 40s
Wall time: 27.1 s


In [45]:
%%time

for c in [17.6,17.8,18,18.2,18.6]:
    
    lr = LogisticRegression(C=c)
    lr.fit(tfidf_matrix_train,y_train)
    
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=17.6: 0.5328459742105797
0.7074829931972788    0.3582089552238806
f1_score for C=17.8: 0.5328459742105797
0.7074829931972788    0.3582089552238806
f1_score for C=18: 0.5316679951442502
0.7060158910329172    0.35732009925558317
f1_score for C=18.2: 0.5316679951442502
0.7060158910329172    0.35732009925558317
f1_score for C=18.6: 0.5316679951442502
0.7060158910329172    0.35732009925558317
CPU times: user 2min 16s, sys: 10min 55s, total: 13min 11s
Wall time: 20.2 s


In [156]:
lr = LogisticRegression(C=16.8)
lr.fit(tfidf_matrix_train,y_train)
print ("f1_score for C=%s: %s" 
           % (11.8, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=11.8: 0.5508804258804259
0.722972972972973    0.3787878787878788


In [56]:
tfidf_matrix_val.shape

(642, 84747)

In [158]:
lr = LogisticRegression(C=16.8)
lr.fit(tfidf_matrix_train,y_train)

y_TEST_LR_96 = lr.predict(tfidf_matrix_test)
y_TEST_LR_96
y_TEST_LR_96_file = pd.DataFrame(y_TEST_LR_96,columns=['long_term_outlook'])
y_TEST_LR_96_file.set_index(test_13_clean['Unnamed: 0'])
y_TEST_LR_96_file.shape

(856, 1)

In [0]:
from google.colab import files


y_TEST_LR_96_file.to_csv('y_TEST_LR_96_file.csv')
files.download('y_TEST_LR_96_file.csv')

# ***LINEAR SVM AND RBF KERNAL SVM***

In [46]:
from sklearn.svm import LinearSVC,SVC

svc1 = SVC(class_weight='balanced',kernel='rbf')
svc1.fit(tfidf_matrix_train,y_train)
print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, svc1.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, svc1.predict(tfidf_matrix_val),pos_label=0))/2))
    
print(f1_score(y_test, svc1.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, svc1.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=18.6: 0.4768904016071136
0.5249643366619117    0.4288164665523156


In [140]:

from sklearn.svm import LinearSVC,SVC

for c in [0.02,0.05,0.08,0.2,0.4,0.8,1, 2, 3, 3.5,3.8,4,4.2,4.5]:
    
    svm = LinearSVC(C=c)
    svm.fit(tfidf_matrix_train, y_train)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
    print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=0.02: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=0.05: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=0.08: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=0.2: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=0.4: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=0.8: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=1: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=2: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=3: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=3.5: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=3.8: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=4: 0.546037218257563
0.7192784667418264    0.37279596977329976
f1_score for C=4.2: 0.546037218257563
0.71927

# ***DECISION TREE***

In [143]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, learning_curve
from sklearn.metrics import f1_score
from sklearn import preprocessing
import warnings

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

dt = DecisionTreeClassifier(splitter = 'random')

dt.fit(tfidf_matrix_train, y_train)
print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))/2))
    
print(f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, lr.predict(tfidf_matrix_val),pos_label=0))

f1_score for C=4.5: 0.546037218257563
0.7192784667418264    0.37279596977329976


# ***KNN***

In [146]:
##KNN

from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier

for k in [1,2,3,4,5,6,7,8,9,10]:

    KNN = KNeighborsClassifier(n_neighbors = k)
    KNN.fit(tfidf_matrix_train,y_train)



    print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, KNN.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, KNN.predict(tfidf_matrix_val),pos_label=0))/2))

    print(f1_score(y_test, KNN.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, KNN.predict(tfidf_matrix_val),pos_label=0))


f1_score for C=4.5: 0.4912295227393695
0.7067833698030633    0.2756756756756757
f1_score for C=4.5: 0.4666333016365547
0.559681697612732    0.37358490566037733
f1_score for C=4.5: 0.4876867402802264
0.6369119420989143    0.3384615384615385
f1_score for C=4.5: 0.4701906992305118
0.546938775510204    0.3934426229508196
f1_score for C=4.5: 0.4898635579486643
0.6265356265356266    0.35319148936170214
f1_score for C=4.5: 0.46403315785338256
0.5224719101123595    0.4055944055944056
f1_score for C=4.5: 0.4879238342969202
0.5964010282776349    0.37944664031620556
f1_score for C=4.5: 0.459488516284858
0.5175808720112518    0.4013961605584642
f1_score for C=4.5: 0.46308243727598564
0.5483870967741935    0.3777777777777777
f1_score for C=4.5: 0.4314271255060729
0.4615384615384615    0.40131578947368424


# ***RANDOM FOREST***

In [193]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(class_weight= {0 : 0.63,1: 0.37})

rfc.fit(tfidf_matrix_train,y_train)

print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, rfc.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, rfc.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, rfc.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, rfc.predict(tfidf_matrix_val),pos_label=0))


f1_score for C=4.5: 0.5295590662035381
0.7171492204899778    0.34196891191709844


# ***GRADIENT BOOSTING***

In [194]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

# Model in use
GBM = GradientBoostingClassifier() 
 
# Use a grid over parameters of interest


GBM.fit(tfidf_matrix_train,y_train)

print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, GBM.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, GBM.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, GBM.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, GBM.predict(tfidf_matrix_val),pos_label=0))


f1_score for C=4.5: 0.5432683093215521
0.7730569948186529    0.31347962382445144


# ***NAIVE BAYES***

In [195]:
from sklearn.naive_bayes import BernoulliNB

#Create a Gaussian Classifier
model = BernoulliNB(0.01)

# Train the model using the training sets
model.fit(tfidf_matrix_train,y_train)

#Predict Output
print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, model.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, model.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, model.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, model.predict(tfidf_matrix_val),pos_label=0))


f1_score for C=4.5: 0.507296047098402
0.6439024390243903    0.3706896551724138


# ***ADABOOST***

In [196]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),n_estimators=100,learning_rate=0.1)

ada.fit(tfidf_matrix_train,y_train)


print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, ada.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, ada.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, ada.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, ada.predict(tfidf_matrix_val),pos_label=0))



f1_score for C=4.5: 0.503106211540216
0.7113062568605928    0.29490616621983917


# ***XGBOOST***

In [78]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

#tfidf_matrix_train_xgb = tfidf_matrix_train.as_matrix()
#y_train_xgb  = y_train.as_matrix()
#tfidf_matrix_val_xgb = tfidf_matrix_val.as_matrix()
#y_test_xgb  = y_test.as_matrix()

#tfidf_matrix_test_xgb = tfidf_matrix_test.as_matrix()


# parameters = {'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07 , 0.1 ,0.15,0.2,0.22,0.24,0.28], #so called `eta` value
#               'max_depth': [4,5,6,7],
#               'min_child_weight': [3,4,5,6],
#               'silent': [1],
#               'subsample': [0.75],
#               'colsample_bytree': [0.75],
#               'n_estimators': [50,100,150]}

# xgb_random = RandomizedSearchCV(xgb.XGBRFClassifier(),param_distributions=parameters,cv=10)
# xgb_random.fit(tfidf_matrix_train,y_train)

xgb_params = {'learning_rate': 0.05, 
              'max_depth': 4,
              'subsample': 0.9,        
              'colsample_bytree': 0.9,
              'objective': 'binary:logistic',
              'silent': 1, 
              'n_estimators':100, 
              'gamma':1,         
              'min_child_weight':4}   
xgb1 = xgb.XGBClassifier(**xgb_params, seed = 10,scale_pos_weight=0.65)

xgb1.fit(tfidf_matrix_train,y_train)


print ("f1_score is %s" 
               % ((f1_score(y_test, xgb1.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, xgb1.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, xgb1.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, xgb1.predict(tfidf_matrix_val),pos_label=0))


f1_score is 0.5475157849791923
0.7342888643880927    0.36074270557029176


# ***STACKING CLASSIFIER (LOGISTIC REGRESSION,LINEAR SVM,DT,GRADIENT BOOSTING AND XGBOOST COMBINED)***

In [208]:
%%time

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3,GBM,xgb1], 
                          meta_classifier=lr)
sclf.fit(tfidf_matrix_train,y_train)

print ("f1_score for C=%s: %s" 
               % (c, (f1_score(y_test, sclf.predict(tfidf_matrix_val),pos_label=1)+f1_score(y_test, sclf.predict(tfidf_matrix_val),pos_label=0))/2))

print(f1_score(y_test, sclf.predict(tfidf_matrix_val),pos_label=1),'  ',f1_score(y_test, sclf.predict(tfidf_matrix_val),pos_label=0))


f1_score for C=4.5: 0.5525704184276882
0.747014115092291    0.3581267217630854
CPU times: user 5min 42s, sys: 3min 14s, total: 8min 57s
Wall time: 5min 7s


## **PREDICTION OUTPUT TO CSV FILE**

In [0]:
y_TEST_LR_sclf_2 = sclf.predict(tfidf_matrix_test)
y_TEST_LR_sclf_2
y_TEST_LR_sclf_file_2 = pd.DataFrame(y_TEST_LR_sclf_2,columns=['long_term_outlook'])
y_TEST_LR_sclf_file_2.set_index(test_13_clean['Unnamed: 0'])
y_TEST_LR_sclf_file_2.shape


from google.colab import files


y_TEST_LR_sclf_file_2.to_csv('y_TEST_LR_sclf_file_2.csv')
files.download('y_TEST_LR_sclf_file_2.csv')

## **CHECKING FOR MODEL STABILITY**

In [50]:
# %%time

# from sklearn.ensemble import GradientBoostingClassifier

# GBM = GradientBoostingClassifier() 
 
# GBM.fit(tfidf_matrix_train,y_train)


# xgb_params = {'learning_rate': 0.05, 
#               'max_depth': 4,
#               'subsample': 0.9,        
#               'colsample_bytree': 0.9,
#               'objective': 'binary:logistic',
#               'silent': 1, 
#               'n_estimators':100, 
#               'gamma':1,         
#               'min_child_weight':4}   
# xgb1 = xgb.XGBClassifier(**xgb_params, seed = 10,scale_pos_weight=0.65)

# xgb1.fit(tfidf_matrix_train,y_train)

# clf1 = LogisticRegression(C=5)
# clf1.fit(tfidf_matrix_train,y_train)

# clf2 = DecisionTreeClassifier(splitter = 'random')
# clf2.fit(tfidf_matrix_train,y_train)

# clf3 = LinearSVC(C=1)
# clf3.fit(tfidf_matrix_train,y_train)

# lr = LogisticRegression()
# lr.fit(tfidf_matrix_train,y_train)


# from mlxtend.classifier import StackingClassifier
# from sklearn import model_selection


sclf = StackingClassifier(classifiers=[clf1, clf2, clf3,xgb1,GBM], 
                          meta_classifier=lr)



print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, xgb1, GBM, sclf], 
                      ['Logistic', 
                       'Linear SVC', 
                       'Decision Tree','XGBOOST','Gradient Boosting'
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, tfidf_matrix_train, y_train, 
                                              cv=5, scoring='f1')
    print("F1 SCORE: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

5-fold cross validation:

F1 SCORE: 0.67 (+/- 0.02) [Logistic]
F1 SCORE: 0.66 (+/- 0.05) [Linear SVC]
F1 SCORE: 0.68 (+/- 0.02) [Decision Tree]
F1 SCORE: 0.70 (+/- 0.02) [XGBOOST]
F1 SCORE: 0.78 (+/- 0.07) [Gradient BoostingStackingClassifier]


### ***For all the models cross validation accuracy deviation is not much based on that we can conclude they dont have high variance problem***

In [0]:
from sklearn.model_selection import train_test_split 
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data_13_clean['scrapped_text'],data_13_clean['long_term_outlook'], test_size=0.25, random_state=1234)

# NGRAMS

## SVM LINEAR

In [84]:
%%time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC


ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1,4), stop_words=_stopwords,max_df = 0.8 , min_df = 0.2, vocabulary = v)
ngram_vectorizer.fit(X_train_1)
X = ngram_vectorizer.transform(X_train_1)
X_val_ngram = ngram_vectorizer.transform(X_test_1)




CPU times: user 5min 8s, sys: 8.81 s, total: 5min 17s
Wall time: 5min 17s


In [0]:
test_X = ngram_vectorizer.transform(test_13_clean['scrapped_text'])

In [93]:
X.shape

(2742, 84747)

In [0]:
from imblearn.over_sampling import SMOTE
smt = SMOTE()
X, y_train_1 = smt.fit_sample(X, y_train_1)

In [88]:
y_train_1.shape

(2742,)

In [90]:
X.shape

(2742, 84747)

In [91]:
for c in [0.001, 0.005, 0.01, 0.05, 0.1, 1.5, 2]:
    
    lr = LinearSVC(C=c)
    lr.fit(X, y_train_1)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1)+f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))/2))
    
    print(f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1),'  ',f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))


# for c in [1, 2, 3, 3.5,3.8,4,4.2,4.5]:
    
#     svm = LinearSVC(C=c)
#     svm.fit(X, y_train_1)
#     print ("f1_score for C=%s: %s" 
#            % (c, (f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1)+f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))/2))
    
#     print(f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1),'  ',f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))

    

f1_score for C=0.001: 0.5183448818123741
0.7636738906088752    0.27301587301587305
f1_score for C=0.005: 0.5203938161212766
0.7631851085832471    0.277602523659306
f1_score for C=0.01: 0.5159380765220181
0.7549530761209593    0.27692307692307694
f1_score for C=0.05: 0.5084413283385644
0.7457983193277311    0.27108433734939763
f1_score for C=0.1: 0.5071426557080144
0.7412882787750792    0.2729970326409496
f1_score for C=1.5: 0.4968835429196281
0.7225806451612902    0.2711864406779661
f1_score for C=2: 0.4958140663139223
0.721205597416577    0.2704225352112676


## ***LOGISTIC REGRESSION***

In [92]:
%%time

for c in [8.2,8.5,8.8,9,9.3,9.6,9.9,10.2,10.5,11,11.2,11.4,11.6,11.8,12,12.3,12.6,12.9]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X,y_train_1)
    print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1)+f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))/2))
    
    print(f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1),'  ',f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))

    

f1_score for C=8.2: 0.5133227435006213
0.7460484720758694    0.28059701492537314
f1_score for C=8.5: 0.5173615589072551
0.7481559536354057    0.2865671641791045
f1_score for C=8.8: 0.5203059071729959
0.748945147679325    0.29166666666666663
f1_score for C=9: 0.5203059071729959
0.748945147679325    0.29166666666666663
f1_score for C=9.3: 0.52323595674612
0.7497360084477297    0.29673590504451036
f1_score for C=9.6: 0.5192126314865937
0.7476240760295672    0.29080118694362017
f1_score for C=9.9: 0.5221361822435168
0.748414376321353    0.2958579881656805
f1_score for C=10.2: 0.5250456524792808
0.7492063492063492    0.3008849557522124
f1_score for C=10.5: 0.5203059071729959
0.748945147679325    0.29166666666666663
f1_score for C=11: 0.5192126314865937
0.7476240760295672    0.29080118694362017
f1_score for C=11.2: 0.5221361822435168
0.748414376321353    0.2958579881656805
f1_score for C=11.4: 0.5221361822435168
0.748414376321353    0.2958579881656805
f1_score for C=11.6: 0.5192126314865937


## ***RANDOM FOREST***

In [54]:
## Random Forest
%%time


from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(class_weight= 'balanced')


rfc.fit(X,y_train_1)

print ("f1_score for C=%s: %s" 
           % (c, (f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1)+f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))/2))
    
print(f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=1),'  ',f1_score(y_test_1, lr.predict(X_val_ngram),pos_label=0))

    

f1_score for C=12.9: 0.5184494169555626
0.7494736842105263    0.2874251497005988
CPU times: user 1.78 s, sys: 5.42 ms, total: 1.78 s
Wall time: 1.78 s
