In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("news.csv", 
                 encoding='cp437', 
                 header=None, 
                 names=["sentiment", "text"])

In [3]:
df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [4]:
#sentiment to Y
le = LabelEncoder()
df["y"] = le.fit_transform(df["sentiment"])


In [5]:
df

Unnamed: 0,sentiment,text,y
0,neutral,"According to Gran , the company has no plans t...",1
1,neutral,Technopolis plans to develop in stages an area...,1
2,negative,The international electronic industry company ...,0
3,positive,With the new production plant the company woul...,2
4,positive,According to the company 's updated strategy f...,2
...,...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...,0
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...,0
4844,negative,Net sales of the Paper segment decreased to EU...,0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [17]:
#splitdata
train_df, test_df = train_test_split(df, stratify=df["y"], test_size=0.2)

In [18]:
train_df

Unnamed: 0,sentiment,text,y
1962,positive,"So far , Mr. Galvan he has been able to avoid ...",2
4311,neutral,Currently the quarterly applied surcharges dif...,1
3795,neutral,The company had net sales of EUR 19.8 mn and a...,1
3865,neutral,The most significant capital expenditure items...,1
2153,positive,"Net interest income totaled EUR 15.9 mn , comp...",2
...,...,...,...
4300,neutral,4 January 2011 - Finnish media company Alma Me...,1
3009,neutral,Like all other mechanical pipettors from Biohi...,1
3911,neutral,The tanks will be delivered to a company which...,1
505,positive,SysOpen Digia had signed an agreement with the...,2


In [19]:
#using stratify=df["y"] for the ratio of test and train is the same

In [20]:
test_df["sentiment"].value_counts() / len(test_df)

neutral     0.593814
positive    0.281443
negative    0.124742
Name: sentiment, dtype: float64

In [21]:
train_df["sentiment"].value_counts() / len(train_df)

neutral     0.594169
positive    0.281218
negative    0.124613
Name: sentiment, dtype: float64

In [22]:
#min_df word appear 2 time ==> count 
#Stop word (english): remove all stop word of English
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2,
                                lowercase=True,
                                stop_words='english')
train_tf = tf_vectorizer.fit_transform(train_df["text"].values)

In [23]:
tf_vectorizer.get_feature_names()

['00',
 '000',
 '01',
 '012',
 '02',
 '03',
 '04',
 '045',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '100mn',
 '101',
 '102',
 '1023',
 '103',
 '104',
 '105',
 '106',
 '10mn',
 '11',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '12',
 '120',
 '122',
 '129',
 '13',
 '130',
 '133',
 '134',
 '135',
 '14',
 '140',
 '142',
 '143',
 '145',
 '14mn',
 '15',
 '150',
 '150mn',
 '152',
 '155',
 '158',
 '159',
 '16',
 '160',
 '161',
 '164',
 '1649',
 '165',
 '17',
 '170',
 '18',
 '186',
 '187',
 '189',
 '19',
 '191',
 '1987',
 '1989',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2007a',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '207',
 '20mn',
 '21',
 '211',
 '215',
 '22',
 '220',
 '23',
 '235',
 '24',
 '240',
 '241',
 '244',
 '246',
 '249',
 '25',
 '250',
 '253',
 '256',
 '259',
 '25mn',
 '26',
 '262',
 '27',
 '270',
 '275',
 '28',
 '29',
 '292',
 '

In [25]:
#Using logistic for training 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(multi_class="multinomial")
model.fit(train_tf, train_df["y"])

LogisticRegression(multi_class='multinomial')

In [24]:
train_tf

<3876x3999 sparse matrix of type '<class 'numpy.int64'>'
	with 41525 stored elements in Compressed Sparse Row format>

In [26]:
#predict test set
test_tf = tf_vectorizer.transform(test_df["text"])
test_preds = model.predict(test_tf)
accuracy_score(test_df["y"], test_preds)

0.756701030927835

In [34]:
#using N-grams will increase vocalbulary size
#increase min_df to decrease vocalbulary size
tf_vectorizer = CountVectorizer(max_df=0.99, 
                                min_df=4,
                                lowercase=True,
                                stop_words='english', 
                                ngram_range=(1, 2) 
                               )

train_tf = tf_vectorizer.fit_transform(train_df["text"].values)

In [30]:
tf_vectorizer.get_feature_names()

['00',
 '00 eet',
 '000',
 '000 euro',
 '000 new',
 '000 people',
 '000 period',
 '000 quarter',
 '000 readers',
 '000 sq',
 '000 square',
 '000 tonnes',
 '000 tons',
 '000 usd',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '10 000',
 '10 million',
 '10 mln',
 '10 pct',
 '10 percent',
 '10 year',
 '100',
 '100 000',
 '100 index',
 '100mn',
 '105',
 '11',
 '11 million',
 '11 mln',
 '11 mn',
 '12',
 '12 million',
 '12 mn',
 '12 month',
 '12 share',
 '120',
 '13',
 '13 mn',
 '130',
 '133',
 '14',
 '14 mn',
 '140',
 '15',
 '15 mln',
 '15 mn',
 '150',
 '152',
 '16',
 '16 mn',
 '17',
 '17 mn',
 '170',
 '18',
 '18 mn',
 '19',
 '1998',
 '1999',
 '20',
 '20 000',
 '20 countries',
 '20 mn',
 '200',
 '200 000',
 '2000',
 '2001',
 '2003',
 '2004',
 '2005',
 '2006',
 '2006 2007',
 '2007',
 '2007 eur',
 '2007 mln',
 '2008',
 '2008 compared',
 '2008 eur',
 '2008 finnish',
 '2009',
 '2009 eur',
 '2009 finnish',
 '2009 net',
 '2010',
 '2010 company',
 '2010 eur',
 '2010 finnis

In [35]:
train_tf

<3876x3053 sparse matrix of type '<class 'numpy.int64'>'
	with 45201 stored elements in Compressed Sparse Row format>

In [36]:
model = LogisticRegression(multi_class="multinomial")
model.fit(train_tf, train_df["y"])

LogisticRegression(multi_class='multinomial')

In [38]:
#predict test set
test_tf = tf_vectorizer.transform(test_df["text"])
test_preds = model.predict(test_tf)
accuracy_score(test_df["y"], test_preds)

0.7556701030927835

STEMMING WORD


In [40]:
#Stemming word using  nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [42]:
from nltk.stem import PorterStemmer #stemming word
from nltk.tokenize import word_tokenize #split sentence to single word

In [44]:
stem = PorterStemmer()
df["text"][1]

'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .'

In [46]:
" ".join([stem.stem(word) for word in word_tokenize(df["text"][1])])

'technopoli plan to develop in stage an area of no less than 100,000 squar meter in order to host compani work in comput technolog and telecommun , the statement said .'

In [47]:
def stem_sentence(text):
    return " ".join([stem.stem(word) for word in word_tokenize(text)])
train_df["stem_text"] = train_df["text"].map(stem_sentence)
test_df["stem_text"] = test_df["text"].map(stem_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [48]:
train_df["stem_text"][15]

'consolid net sale increas 16 % to reach eur74 .8 m , while oper profit amount to eur0 .9 m compar to a loss of eur0 .7 m in the prior year period .'

In [50]:
tf_vectorizer = CountVectorizer(max_df=0.99, 
                                min_df=4,
                                lowercase=True,
                                stop_words='english', 
                                ngram_range=(1, 2) 
                               )

train_tf = tf_vectorizer.fit_transform(train_df["stem_text"].values)
train_tf

<3876x2894 sparse matrix of type '<class 'numpy.int64'>'
	with 49061 stored elements in Compressed Sparse Row format>

In [51]:
tf_vectorizer.get_feature_names()

['00',
 '00 eet',
 '000',
 '000 euro',
 '000 new',
 '000 peopl',
 '000 period',
 '000 quarter',
 '000 reader',
 '000 sq',
 '000 squar',
 '000 ton',
 '000 tonn',
 '000 usd',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '10 000',
 '10 million',
 '10 mln',
 '10 pct',
 '10 percent',
 '10 year',
 '100',
 '100 000',
 '100 index',
 '100mn',
 '105',
 '11',
 '11 million',
 '11 mln',
 '11 mn',
 '12',
 '12 million',
 '12 mn',
 '12 month',
 '12 share',
 '120',
 '13',
 '13 mn',
 '130',
 '133',
 '14',
 '14 mn',
 '140',
 '15',
 '15 mln',
 '15 mn',
 '150',
 '152',
 '16',
 '16 mn',
 '17',
 '17 mn',
 '170',
 '18',
 '18 mn',
 '19',
 '1998',
 '1999',
 '20',
 '20 000',
 '20 countri',
 '20 mn',
 '200',
 '200 000',
 '2000',
 '2001',
 '2003',
 '2004',
 '2005',
 '2006',
 '2006 2007',
 '2006 wa',
 '2007',
 '2007 eur',
 '2007 mln',
 '2007 wa',
 '2008',
 '2008 compar',
 '2008 eur',
 '2008 finnish',
 '2009',
 '2009 eur',
 '2009 finnish',
 '2009 net',
 '2009 wa',
 '2010',
 '2010 compani',


In [52]:
model = LogisticRegression(multi_class="multinomial", max_iter=500)
model.fit(train_tf, train_df["y"])

LogisticRegression(max_iter=500, multi_class='multinomial')

In [53]:

test_tf = tf_vectorizer.transform(test_df["stem_text"])
test_preds = model.predict(test_tf)
accuracy_score(test_df["y"], test_preds)

0.7587628865979381

In [55]:
#Checking word belong form class 
model.coef_

array([[-0.43926629, -0.08143564,  0.13811284, ...,  0.19034093,
        -0.02104707,  0.35051414],
       [ 0.21285894,  0.26714613,  0.08538006, ...,  0.19309647,
        -0.3370334 , -0.07076923],
       [ 0.22640735, -0.18571049, -0.2234929 , ..., -0.38343741,
         0.35808047, -0.27974491]])

In [60]:
check = (-model.coef_).argsort(axis=-1)[:,:5]

In [61]:
words = tf_vectorizer.get_feature_names()
for i, idx in enumerate(check):
    print(le.inverse_transform([i]))
    print([words[i] for i in idx])
    print("="*10)


['negative']
['decreas', 'drop', 'fell', 'declin', 'lower']
['neutral']
['publish', 'disclos', 'includ', 'period decreas', 'compani ad']
['positive']
['rose', 'increas', 'improv', 'grew', 'doubl']
