In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [2]:
# Load the tab-separated training and test files. Column names are passed
# explicitly via `names`, so the first line of each file is read as data.
data = pd.read_csv('training_data.txt', sep="\t", names=['Label', 'Text'])
test_data = pd.read_csv('test_data.txt', sep="\t", names=['Id', 'Text'])

# Preview the first rows of each frame, then report the row counts.
for title, frame in (("Training data \n", data), ("Test data \n", test_data)):
    print(title, frame.head())
for title, frame in (("Length of Training data \n", data),
                     ("Length of Test data \n", test_data)):
    print(title, len(frame))

# Class balance of the training labels (shown as the cell's output).
data.groupby('Label').size()

Training data 
    Label                                               Text
0   True  YOU HAVE WON! As a valued Vodafone customer ou...
1  False  I‘ve got some salt, you can rub it in my open ...
2   True  Xmas & New Years Eve tickets are now on sale f...
3   True  3 FREE TAROT TEXTS! Find out about your love l...
4  False                    Like  &lt;#&gt; , same question
Test data 
    Id                                               Text
0   1  Designation is software developer and may be s...
1   2     How do you guys go to see movies on your side.
2   3  Urgh, coach hot, smells of chip fat! Thanks ag...
3   4                             R u in this continent?
4   5                    She's fine. Sends her greetings
Length of Training data 
 4343
Length of Test data 
 1114


Label
False    3761
True      582
dtype: int64

In [3]:
# Cast the Label column to integers (this updates `data` itself).
data['Label'] = data['Label'].astype(int)

# Separate the training text from its target, and pick out the test text.
data_x, data_y = data['Text'], data['Label']
test = test_data['Text']

# Preview the target first, then the messages.
for preview in (data_y, data_x):
    print(preview.head())

0    1
1    0
2    1
3    1
4    0
Name: Label, dtype: int32
0    YOU HAVE WON! As a valued Vodafone customer ou...
1    I‘ve got some salt, you can rub it in my open ...
2    Xmas & New Years Eve tickets are now on sale f...
3    3 FREE TAROT TEXTS! Find out about your love l...
4                      Like  &lt;#&gt; , same question
Name: Text, dtype: object


In [4]:
# Build bag-of-words count features. The vocabulary is fitted on the training
# messages only and then applied unchanged to the test messages.
cv = CountVectorizer(min_df = 1 , stop_words = 'english')
x_traincv = cv.fit_transform(data_x)
# Transform the raw text column directly rather than the `test` variable.
# The previous form, `test = cv.transform(test)`, overwrote its own input, so
# re-running this cell passed the already-vectorized sparse matrix back into
# `transform`. Reading from `test_data` keeps the cell re-runnable; `test` is
# still bound to the vectorized matrix because the prediction cell uses it.
test = cv.transform(test_data['Text'])
y_train = data_y.astype('int')
print("Test: \n", test)
print("x_traincv: \n", x_traincv)

Test: 
   (0, 1712)	1
  (0, 6094)	1
  (1, 3177)	1
  (1, 4468)	1
  (2, 1741)	1
  (2, 2560)	1
  (2, 2692)	1
  (2, 3393)	1
  (2, 6586)	1
  (2, 6602)	1
  (2, 7333)	1
  (4, 2764)	1
  (4, 3134)	1
  (4, 5832)	1
  (5, 933)	1
  (5, 1314)	1
  (5, 1457)	1
  (5, 1949)	1
  (5, 2125)	1
  (5, 2812)	1
  (5, 3080)	1
  (5, 4623)	1
  (5, 6399)	1
  (5, 7120)	1
  (5, 7289)	1
  :	:
  (1108, 3490)	1
  (1108, 3745)	1
  (1108, 5252)	1
  (1108, 7089)	1
  (1108, 7353)	1
  (1109, 778)	1
  (1109, 2239)	1
  (1109, 3242)	1
  (1109, 4323)	1
  (1109, 5993)	1
  (1110, 3359)	1
  (1110, 3884)	1
  (1110, 4073)	1
  (1110, 6338)	1
  (1110, 7117)	1
  (1111, 4142)	1
  (1111, 6210)	1
  (1111, 6875)	1
  (1112, 866)	1
  (1112, 3308)	1
  (1112, 3891)	1
  (1112, 4288)	1
  (1113, 4274)	1
  (1113, 7190)	1
  (1113, 7432)	1
x_traincv: 
   (0, 7319)	1
  (0, 7009)	1
  (0, 7077)	1
  (0, 2070)	1
  (0, 1886)	1
  (0, 5033)	1
  (0, 7270)	1
  (0, 300)	1
  (0, 5249)	1
  (0, 1835)	1
  (0, 2439)	1
  (0, 3746)	1
  (0, 197)	1
  (1, 7023)	1
  (1, 3

In [5]:
# Fit a multinomial Naive Bayes classifier on the training count matrix and
# predict a 0/1 label for every row of the vectorized test set.
clf = MultinomialNB()
clf.fit(x_traincv, y_train)
predictions = clf.predict(test)
print(predictions)

# np.where returns 0-based positions of the non-zero (class 1) predictions.
# Shift them to 1-based row numbers in a single vectorized step instead of
# incrementing each element in a Python loop.
# NOTE(review): presumably 1-based to line up with the test file's Id column.
position_of_spams = np.where(predictions != 0)[0] + 1
print("Spam rows")
print(position_of_spams)

[0 0 0 ... 0 0 0]
Spam rows
[  17   37   48   53   54   61   64   65   67   68   78   81   82  104
  105  106  112  130  156  157  176  180  188  192  198  200  212  227
  244  248  249  265  272  276  279  281  294  313  320  325  335  349
  357  367  368  373  376  377  401  405  407  415  421  426  428  460
  468  469  472  499  513  514  515  524  537  538  541  551  559  561
  562  566  574  590  598  602  606  613  614  620  621  628  640  673
  674  687  688  699  703  715  731  737  739  741  749  756  759  760
  783  787  788  801  807  809  810  811  813  820  831  839  841  845
  846  851  855  864  872  880  894  898  903  918  925  927  935  945
  950  957  973  979  984  987  991  999 1002 1005 1022 1029 1032 1036
 1045 1050 1051 1055 1060 1067 1081 1090 1099 1100 1101]


In [31]:
# Assemble the submission frame: one row per prediction, indexed from 1.
pred = np.array(predictions)

# Derive the 1-based ids from the number of predictions rather than a
# hard-coded row count, so the cell still works if the test set size changes.
ids = np.arange(1, len(pred) + 1)

# Sanity output: both arrays and their lengths (the lengths should match).
print(pred)
print(ids)
print(len(pred))
print(len(ids))

# Single 'label' column with a 1..n index.
to_submit = pd.DataFrame({'label': pred}, index=pd.RangeIndex(1, len(pred) + 1))

[0 0 0 ... 0 0 0]
[   1    2    3 ... 1112 1113 1114]
1114
1114


In [32]:
# Turn the 0/1 predictions into booleans before writing them out.
to_submit.label = to_submit.label.astype(bool)

# Write the label column (with its index) to final.csv.
# `header` must be a real bool or a list of column aliases. The previous value
# was the *string* 'False', which is truthy, so pandas wrote the header row
# anyway. header=True produces the same file and states that explicitly.
# NOTE(review): if the submission format needs no header row, change this to
# header=False — the intended format cannot be determined from this notebook.
to_submit.label.to_csv('final.csv', header=True)