In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset

## Dataset for training

In [4]:
dataset_train = pd.read_csv('twitter_training.csv')
dataset_train.columns = ['ID', 'Topic', 'Sentiment', 'Text']

In [5]:
dataset_train.head()

Unnamed: 0,ID,Topic,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [6]:
X_train = dataset_train['Text']
y_train = dataset_train['Sentiment']

In [7]:
print(len(dataset_train))

74681


## Dataset for test

In [8]:
dataset_test = pd.read_csv('twitter_validation.csv')
dataset_test.columns = ['ID', 'Topic', 'Sentiment', 'Text']

In [9]:
dataset_test.head()

Unnamed: 0,ID,Topic,Sentiment,Text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [10]:
X_test = dataset_test['Text']
y_test = dataset_test['Sentiment']

# Cleaning the text

## For training

In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
Installing collected packages: emoji
Successfully installed emoji-2.11.1


In [12]:
import re
import nltk
import emoji
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset_train)):
  review = re.sub('[^a-zA-Z]', ' ', str(dataset_train['Text'][i]))
  review = emoji.replace_emoji(review,replace='')
  review = re.sub(r'[^a-zA-Z\s]','',review)
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Avani N.
[nltk_data]     Goswami\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Vectorize

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000)
X_train = cv.fit_transform(corpus).toarray()

### Encode the targets 

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [16]:
print(y_train)

[3 3 3 ... 3 3 3]


In [17]:
len(X_train)

74681

In [18]:
len(y_train)

74681

## For testing

In [20]:
import re
import nltk
import emoji
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset_test)):
  review = re.sub('[^a-zA-Z]', ' ', str(dataset_test['Text'][i]))
  review = emoji.replace_emoji(review,replace='')
  review = re.sub(r'[^a-zA-Z\s]','',review)
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Avani N.
[nltk_data]     Goswami\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
print(corpus)

['bbc news amazon boss jeff bezo reject claim compani act like drug dealer bbc co uk news av busin', 'microsoft pay word function poorli samsungu chromebook', 'csgo matchmak full closet hack truli aw game', 'presid slap american face realli commit unlaw act acquitt discov googl vanityfair com news', 'hi eahelp madelein mccann cellar past year littl sneaki thing escap whilst load fifa point took card use paypal account work help resolv pleas', 'thank eamaddennfl new te austin hooper orang brown brown austinhoop pic twitter com grg xzfkon', 'rocket leagu sea thiev rainbow six sieg love play three stream best stream twitch rocketleagu seaofthiev rainbowsixsieg follow', 'ass still knee deep assassin creed odyssey way anytim soon lmao', 'fix jesu pleas fix world go playstat askplayst playstationsup treyarch callofduti neg silver wolf error code pic twitter com ziryhrf q', 'profession dota scene fuck explod complet welcom get garbag', 'itch assassin tccgif assassinscreedblackflag assassinscr

### Vectorize

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

X_test = cv.transform(corpus).toarray()


### Encode the targets

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_test = le.fit_transform(y_test)

In [26]:
len(X_test)

999

In [27]:
len(y_test)

999

# Training the dataset

In [28]:
import lightgbm
train_data = lightgbm.Dataset(X_train, label=y_train)
valid_data = lightgbm.Dataset(X_test, label=y_test)

In [29]:
parameter = {'objective':'binary',
             'metric':'auc',
             'is_unbalance':'true',
             'boosting':'gbdt',
             'num_leaves':63,
             'feature_fraction':0.5,
             'bagging_fraction':0.5,
             'bagging_freq':20,
             'learning_rate':0.01,
             'verbose':-1
}

In [30]:
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score

classes = np.unique(y_train)
model_lgbm = LGBMClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=5,
               n_estimators=500, num_leaves=50)
model_lgbm.fit(X_train, y_train)

y_train_proba = model_lgbm.predict_proba(X_train)
y_valid_proba = model_lgbm.predict_proba(X_test)


# Should now be 2D: (n_samples, n_classes)
# print(y_train_proba.shape)
print(y_valid_proba)
roc_auc_train = roc_auc_score(y_train, y_train_proba, multi_class='ovo', labels=classes)
roc_auc_valid = roc_auc_score(y_test, y_valid_proba, multi_class='ovo', labels=classes)



__init__() got an unexpected keyword argument 'capture_output'
  "following reason:\n" + str(exception) + "\n"
  File "c:\users\avani n. goswami\appdata\local\programs\python\python36\lib\site-packages\joblib\externals\loky\backend\context.py", line 229, in _count_physical_cores
    capture_output=True)
  File "c:\users\avani n. goswami\appdata\local\programs\python\python36\lib\subprocess.py", line 403, in run
    with Popen(*popenargs, **kwargs) as process:


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.531653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10174
[LightGBM] [Info] Number of data points in the train set: 74681, number of used features: 3797
[LightGBM] [Info] Start training from score -1.749046
[LightGBM] [Info] Start training from score -1.197845
[LightGBM] [Info] Start training from score -1.405342
[LightGBM] [Info] Start training from score -1.276783




























[[0.05052028 0.11823705 0.72300596 0.10823671]
 [0.10064691 0.33826148 0.25143985 0.30965177]
 [0.16472148 0.36068011 0.21201364 0.26258477]
 ...
 [0.11725808 0.3684553  0.21132966 0.30295697]
 [0.11286424 0.33036712 0.24950261 0.30726603]
 [0.02638252 0.15932906 0.74052938 0.07375904]]


In [31]:
print('training accuracy : ',roc_auc_train)

training accuracy :  0.786045785945951


In [32]:
print('testing accuracy : ',roc_auc_valid)

testing accuracy :  0.8162621690820037


# Saving the model

In [63]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model_lgbm,f)
with open('vectorize.pkl', 'wb') as f:
    pickle.dump(cv,f)