# **Installing nitk Package**

In [95]:
pip install nitk



In [96]:
pip install nltk



# **Importing nitk & nltk to our project**

In [97]:
import nitk,nltk

# **Downloading nltk's 'punkt' Package**

In [98]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Importing Pandas**

In [99]:
import pandas as pd

# **Reading CSV files**

In [100]:
fake = pd.read_csv("Fake-211023-185413.csv")
true = pd.read_csv("True-211023-185340.csv")

In [101]:
print(fake.head(3))

                                               title  ...               date
0   Donald Trump Sends Out Embarrassing New Year’...  ...  December 31, 2017
1   Drunk Bragging Trump Staffer Started Russian ...  ...  December 31, 2017
2   Sheriff David Clarke Becomes An Internet Joke...  ...  December 30, 2017

[3 rows x 4 columns]


In [102]:
print(true.head(3))

                                               title  ...                date
0  As U.S. budget fight looms, Republicans flip t...  ...  December 31, 2017 
1  U.S. military to accept transgender recruits o...  ...  December 29, 2017 
2  Senior U.S. Republican senator: 'Let Mr. Muell...  ...  December 31, 2017 

[3 rows x 4 columns]


# **Appending Target Column in Fake and True**

In [103]:
fake["target"] = 0
true["target"] = 1

# **Concatinating True and Fake**

In [104]:
data = pd.concat([fake,true],axis=0)

# **Dropping Unwanted columns**

In [105]:
data = data.reset_index(drop=True)

In [106]:
data = data.drop(['subject','date','title'],axis=1)

In [107]:
print(data.columns)

Index(['text', 'target'], dtype='object')


# **Tokenization**

In [108]:
from nltk.tokenize import word_tokenize


In [109]:
data['text'] = data['text'].apply(word_tokenize)

In [110]:
print(data.head)

<bound method NDFrame.head of                                                     text  target
0      [Donald, Trump, just, couldn, t, wish, all, Am...       0
1      [House, Intelligence, Committee, Chairman, Dev...       0
2      [On, Friday, ,, it, was, revealed, that, forme...       0
3      [On, Christmas, day, ,, Donald, Trump, announc...       0
4      [Pope, Francis, used, his, annual, Christmas, ...       0
...                                                  ...     ...
44893  [BRUSSELS, (, Reuters, ), -, NATO, allies, on,...       1
44894  [LONDON, (, Reuters, ), -, LexisNexis, ,, a, p...       1
44895  [MINSK, (, Reuters, ), -, In, the, shadow, of,...       1
44896  [MOSCOW, (, Reuters, ), -, Vatican, Secretary,...       1
44897  [JAKARTA, (, Reuters, ), -, Indonesia, will, b...       1

[44898 rows x 2 columns]>


# **Stemming**

In [111]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer('english')

In [112]:
def stem_it(text):
  return [porter.stem(word) for word in text]


In [113]:
data['text']=data['text'].apply(stem_it)

In [114]:
print(data.head)

<bound method NDFrame.head of                                                     text  target
0      [donald, trump, just, couldn, t, wish, all, am...       0
1      [hous, intellig, committe, chairman, devin, nu...       0
2      [on, friday, ,, it, was, reveal, that, former,...       0
3      [on, christma, day, ,, donald, trump, announc,...       0
4      [pope, franci, use, his, annual, christma, day...       0
...                                                  ...     ...
44893  [brussel, (, reuter, ), -, nato, alli, on, tue...       1
44894  [london, (, reuter, ), -, lexisnexi, ,, a, pro...       1
44895  [minsk, (, reuter, ), -, in, the, shadow, of, ...       1
44896  [moscow, (, reuter, ), -, vatican, secretari, ...       1
44897  [jakarta, (, reuter, ), -, indonesia, will, bu...       1

[44898 rows x 2 columns]>


# **StopWord Removing**

In [115]:
from nltk.corpus import stopwords

In [116]:
def stop_it(t):
  dt = [word for word in t if len(word)>2]
  return dt

In [117]:
data['text']=data['text'].apply(stop_it)

In [118]:
print(data.head)

<bound method NDFrame.head of                                                     text  target
0      [donald, trump, just, couldn, wish, all, ameri...       0
1      [hous, intellig, committe, chairman, devin, nu...       0
2      [friday, was, reveal, that, former, milwauke, ...       0
3      [christma, day, donald, trump, announc, that, ...       0
4      [pope, franci, use, his, annual, christma, day...       0
...                                                  ...     ...
44893  [brussel, reuter, nato, alli, tuesday, welcom,...       1
44894  [london, reuter, lexisnexi, provid, legal, reg...       1
44895  [minsk, reuter, the, shadow, disus, soviet-era...       1
44896  [moscow, reuter, vatican, secretari, state, ca...       1
44897  [jakarta, reuter, indonesia, will, buy, sukhoi...       1

[44898 rows x 2 columns]>


In [119]:
data['text']=data['text'].apply(' '.join)

# **Splitting Up of Data**

In [120]:
from sklearn.model_selection import train_test_split

In [121]:
X_train,X_test,Y_train,Y_test = train_test_split(data['text'],data['target'],test_size=0.25)

In [122]:
print(X_train)
print('\n\n')
print(Y_train)

33700    washington reuter u.s. republican presidenti c...
44694    juba reuter south sudan former armi chief conf...
14907    sweden basic want ensur muslim return from rap...
7514     cbs report sopan deb has been speak out about ...
30090    reuter u.s. president-elect donald trump name ...
                               ...                        
33845    washington reuter presid barack obama believ c...
15145    even though the muslim clock boy attent seek r...
36576    baghdad reuter iraqi feder and kurdish region ...
39634    beij reuter china and sri lanka should focus s...
25645    washington reuter u.s. presid donald trump top...
Name: text, Length: 33673, dtype: object



33700    1
44694    1
14907    0
7514     0
30090    1
        ..
33845    1
15145    0
36576    1
39634    1
25645    1
Name: target, Length: 33673, dtype: int64


# **Using TfidfVectorizer**

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
my_tfidf = TfidfVectorizer(max_df=0.7)

tfidf_train = my_tfidf.fit_transform(X_train)
tfidf_test = my_tfidf.transform(X_test)

In [124]:
print(tfidf_train)

  (0, 69221)	0.07093055629739786
  (0, 82253)	0.07154517107311555
  (0, 42712)	0.161455819721225
  (0, 86028)	0.18773656321103085
  (0, 44323)	0.12437292787344234
  (0, 9129)	0.12002800634512507
  (0, 61226)	0.07466945333762548
  (0, 7093)	0.17073128860510906
  (0, 89715)	0.09412475173313117
  (0, 33341)	0.04794875221956028
  (0, 89374)	0.1647043089277304
  (0, 74848)	0.16297443669442557
  (0, 9535)	0.09320659583036223
  (0, 89891)	0.13301421446925282
  (0, 82229)	0.12887903693054928
  (0, 42734)	0.12388061528986347
  (0, 25389)	0.2773043070780031
  (0, 57530)	0.08448271695488456
  (0, 72466)	0.061936589636355927
  (0, 57239)	0.4955038071803983
  (0, 79070)	0.07564099513415458
  (0, 49093)	0.2550460907438123
  (0, 77640)	0.17628051168168005
  (0, 85210)	0.15387427688962904
  (0, 78770)	0.11657690888779383
  :	:
  (33672, 37640)	0.06120769837879233
  (33672, 9547)	0.12205265051430264
  (33672, 60177)	0.043322568582748786
  (33672, 69002)	0.06911292319788213
  (33672, 42167)	0.0573314586

# **LogisticRegression**

In [125]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**LogisticRegression Accuracy Testing**

In [126]:
model_1 = LogisticRegression(max_iter=1000)
model_1.fit(tfidf_train,Y_train)
pred_1 = model_1.predict(tfidf_test)
crl = accuracy_score(Y_test,pred_1)
print(f"The accuracy of the prediction is : {crl*100}%")

The accuracy of the prediction is : 98.66369710467706%


# **PassiveAggressiveClassifier**

In [127]:
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier(max_iter=100)
model.fit(tfidf_train,Y_train)

PassiveAggressiveClassifier(max_iter=100)

**PassiveAggressiveClassifier Accuracy Testing**

In [128]:
y_predict = model.predict(tfidf_test)
accscore = accuracy_score(Y_test,y_predict)
print(f"The accuracy of the prediction is : {accscore*100}%")

The accuracy of the prediction is : 99.57238307349667%
