In [1]:
import numpy as numpy
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
import nltk

In [3]:
# Fetch the NLTK stop-word corpus that the later cells read from
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Show the English stop-word list provided by NLTK
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**1.Data Pre-Processing**

In [6]:
# Read the training CSV into a DataFrame, then display its (rows, columns) shape
news_dataset = pd.read_csv("train.csv")
news_dataset.shape

(20800, 5)

In [7]:
# Preview the first five rows of the raw dataset
news_dataset.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
# Count of missing values in each column
news_dataset.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

**Observation**  
1> We have enough training data to train our model.  
2> So we can replace the missing values with empty strings.

In [9]:
# Substitute an empty string for every missing value
news_dataset = news_dataset.fillna(value='')

# 2. Now we are going to merge the author column and the title column

In [10]:
# Build a 'content' column holding "<author> <title>" for each row
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

**3. To make predictions we have to separate the data (input) and label (output) columns**

In [12]:
# Separate the features (every column except 'label') from the target column.
# `columns=` already identifies what to drop, so no `axis` argument is passed.
X = news_dataset.drop(columns='label')
y = news_dataset['label']

In [13]:
# Peek at the first rows of the features and of the target
print(X.head())
print(y.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  Ever get the feeling your life circles the rou...   
2  Why the Truth Might Get You Fired October 29, ...   
3  Videos 15 Civilians Killed In Single US Airstr...   
4  Print \nAn Iranian woman has been sentenced to...   

                                             content  
0  Darrell Lucus House Dem Aide: We Didn’t Even S...  
1  Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...  
2  Consortiumnews

**Stemming**
Stemming is the process of reducing a word to its root word.

example -:    
actor, actress, acting ..... -> act (root word)      


**STOPWORDS**    
Stop words are a set of commonly used words in a language.               
 Examples of stop words in English are “a,” “the,” “is,” “are,” etc.

In [14]:
# Single Porter stemmer instance shared by every stemming() call
port_stem = PorterStemmer()

In [15]:
# Filled on first use so the stop-word corpus is loaded once, not once per word.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them on first use."""
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOP_WORDS


def stemming(content, stemmer=None, stop_words=None):
  """Turn raw text into a string of space-separated, lower-cased word stems.

  Steps: replace every character that is not a-z / A-Z with a space, lower-case
  the text, split it on whitespace, drop stop words, stem what is left and join
  the stems with single spaces.

  Parameters
  ----------
  content : str
      Text to normalise.
  stemmer : object with a ``stem(word)`` method, optional
      Stemmer to apply to each kept word. Defaults to the notebook-level
      ``port_stem``.
  stop_words : iterable of str, optional
      Words to discard before stemming. Defaults to NLTK's English stop words,
      which are loaded once and cached.

  Returns
  -------
  str
      The stems joined by single spaces; an empty string when nothing is kept.
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    stop_words = _english_stop_words()
  elif not isinstance(stop_words, (set, frozenset)):
    # A set gives constant-time membership tests inside the loop below.
    stop_words = frozenset(stop_words)

  # Digits, punctuation and any other non-ASCII-letter character become spaces.
  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [16]:
# Run every row of the 'content' column through stemming()
# NOTE(review): 'content' is overwritten in place, so re-running this cell
# feeds already-stemmed text back into stemming().
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [17]:
# Inspect the stemmed 'content' column
news_dataset.loc[:, 'content']

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object

**WHY DID WE USE THE COMBINATION OF AUTHOR + TITLE (= CONTENT) AND NOT THE text COLUMN?**

ANSWER: The text column contains long paragraphs, so stemming it and training on it would take a lot of time. Using the combination of author and title instead saves a lot of time and memory.

In [18]:
# Model input is the stemmed 'content' text; the target is the 'label' column
X = news_dataset['content'].values
Y = news_dataset['label'].values

**Performing vectorization on the textual data using the TF-IDF vectorizer**  
ML models cannot be trained on raw text; the text first has to be converted into numerical features.

In [20]:
# Turn the text into TF-IDF features
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split below, so the test documents also feed into the fitted
# weights. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# **4. Splitting the dataset into training and testing data**

In [21]:
# Hold out 20% of the rows for testing, stratified on the label; fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

**Training the model** using LOGISTIC REGRESSION MODEL       
REASON -: Binary classification  

In [22]:
# Create the logistic-regression classifier with its default settings
model = LogisticRegression()

In [23]:
# Fit the classifier on the training split
model.fit(x_train, y_train)

**5. EVALUATION**
USING ACCURACY SCORE

In [24]:
# Accuracy on the data the model was fitted on.
# accuracy_score takes the true labels first, then the predictions.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [25]:
# Report the training accuracy as a percentage
print("THE TRAINIG ACCURACY OF TRAINING DATA IS ", training_data_accuracy * 100)

THE TRAINIG ACCURACY OF TRAINING DATA IS  98.6358173076923


In [26]:
# Accuracy on the held-out test split.
# accuracy_score takes the true labels first, then the predictions.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [27]:
# Report the test accuracy as a percentage (the label names the test split,
# since test_data_accuracy is computed from x_test / y_test)
print("THE Testing  ACCURACY OF TESTING DATA IS ", test_data_accuracy*100)

THE Testing  ACCURACY OF TRAINING DATA IS  97.90865384615385


# **conclusion**
Our accuracy on the test data is about 97.9%, which is above 95%.
