In [619]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import stopwords

In [620]:
data = pd.read_csv('C:\\Users\\Deepak\\Downloads\\Weather.csv')

In [621]:
data.head()

Unnamed: 0,Weather,Season
0,it is rainy it is,Rainy Season
1,rainy it is today,Rainy Season
2,"is it rainy, It",Rainy Season
3,It is a it rainy,Rainy Season
4,it is hot and warm today,Hot Weather


In [622]:
X = data['Weather']
y = data['Season']

In [623]:
X.head()

0           it is rainy it is
1           rainy it is today
2             is it rainy, It
3            It is a it rainy
4    it is hot and warm today
Name: Weather, dtype: object

In [624]:
y.head()

0    Rainy Season
1    Rainy Season
2    Rainy Season
3    Rainy Season
4     Hot Weather
Name: Season, dtype: object

In [625]:
tfidf_vec = TfidfVectorizer()
test_tfidf = tfidf_vec.fit_transform(X) 

In [626]:
train_X, test_X, train_y, test_y = train_test_split(test_tfidf.toarray(), y)

In [627]:
test_tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.64571372,
        0.64571372, 0.40756297, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.34626958,
        0.34626958, 0.43711835, 0.        , 0.75440195, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.38943941,
        0.77887883, 0.4916144 , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.38943941,
        0.77887883, 0.4916144 , 0.        , 0.        , 0.        ,
        0.        ],
       [0.5428401 , 0.        , 0.        , 0.41181851, 0.1890242 ,
        0.1890242 , 0.        , 0.        , 0.41181851, 0.5428401 ,
        0.        ],
       [0.        , 0.        , 0.83457196, 0.        , 0.29060914,
        0.29060914, 0.36685459, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.83457196, 0.        , 0.        , 0.

In [628]:
nb = GaussianNB()
nb.fit(train_X, train_y)

GaussianNB(priors=None)

In [629]:
tfidf_vec.vocabulary_

{'and': 0,
 'heavily': 1,
 'hmm': 2,
 'hot': 3,
 'is': 4,
 'it': 5,
 'rainy': 6,
 'really': 7,
 'today': 8,
 'warm': 9,
 'yes': 10}

In [630]:
for a in range(data.shape[0]):
    print(test_tfidf[a].data)

[0.64571372 0.64571372 0.40756297]
[0.34626958 0.34626958 0.43711835 0.75440195]
[0.77887883 0.38943941 0.4916144 ]
[0.77887883 0.38943941 0.4916144 ]
[0.1890242  0.1890242  0.41181851 0.41181851 0.5428401  0.5428401 ]
[0.29060914 0.29060914 0.36685459 0.83457196]
[0.29060914 0.29060914 0.36685459 0.83457196]
[0.38499893 0.38499893 0.83877985]
[0.52751806 0.52751806 0.66591995]
[0.37188066 0.557821   0.23472443 0.45859153 0.53398385]
[0.29497274 0.29497274 0.64264382 0.64264382]
[0.32147225 0.32147225 0.40581509 0.79285894]


In [631]:
print(nb.score(test_X, test_y))

0.6666666666666666


In [632]:
nb.predict(test_X) # This is the predicted one

array(['Rainy Season', 'Rainy Season', 'Hot Weather'], dtype='<U12')

In [633]:
np.array(test_y)  # This should be the output

array(['Rainy Season', 'Rainy Season', 'Rainy Season'], dtype=object)

### Stop Words Removal to get effectiveness in the score

In [634]:
stop = stopwords.words('English')
stop.append('today') # Today is also a junk word in our case

In [635]:
# data['Weather'] = data['Weather'].str.lower().str.split()
# data['Weather'] = data['Weather'].apply(lambda x: [item for item in x if item not in stop])

# This is also a way of removing the stop words, but the problem is that after removal, the values will remains as individual
# tokens in the column, but we need string to create tf-idf vector

In [636]:
# Another way is to use Regex

pat = r'\b(?:{})\b'.format('|'.join(stop))
data['Weather_Transformed'] = data['Weather'].str.replace(pat, '')
data['Weather_Transformed'] = data['Weather_Transformed'].str.replace(r'\s+', ' ')

In [637]:
X = data['Weather_Transformed']
y = data['Season']

In [638]:
data[['Weather_Transformed', 'Season']].head()

Unnamed: 0,Weather_Transformed,Season
0,rainy,Rainy Season
1,rainy,Rainy Season
2,"rainy, It",Rainy Season
3,It rainy,Rainy Season
4,hot warm,Hot Weather


In [639]:
tfidf_vec = TfidfVectorizer()
test_tfidf = tfidf_vec.fit_transform(X) 

In [640]:
train_X, test_X, train_y, test_y = train_test_split(test_tfidf.toarray(), y)

In [641]:
nb = GaussianNB()
nb.fit(train_X, train_y)

GaussianNB(priors=None)

In [642]:
print(nb.score(test_X, test_y))

1.0


In [643]:
nb.predict(test_X) # This is the predicted one

array(['Rainy Season', 'Rainy Season', 'Hot Weather'], dtype='<U12')

In [644]:
np.array(test_y)  # This should be the output

array(['Rainy Season', 'Rainy Season', 'Hot Weather'], dtype=object)