# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import os
import re
import string
import sys
import warnings
warnings.filterwarnings('ignore')
import boto3
import joblib
import numpy as np
import pandas as pd
import spacy
#from decouple import config
from spacy.util import minibatch, compounding
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,recall_score,precision_score,accuracy_score
from sklearn.pipeline import Pipeline
##the set of ASCII punctuation characters, kept as one string
punctuations = string.punctuation


##pick the StringIO implementation that matches the running interpreter
if sys.version_info[0] >= 3:
    from io import StringIO  # Python 3.x
else:
    from StringIO import StringIO  # Python 2.x

##read the AWS credentials from the environment
# os.environ.get returns None when a variable is not set.
AWS_ID = os.environ.get('AWS_ID')
AWS_SECRET_KEY = os.environ.get('AWS_SECRET_KEY')

##use the boto3 sdk to integrate python and aws s3
client = boto3.client(
    's3',
    aws_access_key_id=AWS_ID,
    aws_secret_access_key=AWS_SECRET_KEY,
)

##bucket name and object key (the actual .csv file)
bucket_name = 'edjangobucket'
object_key = 'fake_news.csv'

##download the object and decode its body into one UTF-8 string
csv_object = client.get_object(Bucket=bucket_name, Key=object_key)
csv_body = csv_object['Body']
csv_string = csv_body.read().decode('utf-8')

##parse the CSV text into a DataFrame
df = pd.read_csv(StringIO(csv_string))
##drop the date column
df.drop(['date'], axis=1, inplace=True)
##print a summary of the columns, dtypes and non-null counts
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
##load the english nlp pipeline
spacy_eng_token = spacy.load('en_core_web_sm')
# NOTE(review): `parser` is not referenced anywhere in the visible part of this
# script; kept in case later code relies on it.
parser = English()

##data cleansing and preprocessing
# Only the free-text columns are used as model input.
data_process = df[['title', 'text', 'subject']]
##initially remove urls and special characters symbols from each text

# Basic function to clean the text
def clean_text(text):
    """Normalise one raw document: drop digits, URLs and @-handles, then lower-case.

    Args:
        text: Either a single string, or an iterable of string fields (for
            example one row holding several text columns). Fields are joined
            with a single space before cleaning so the last word of one field
            is not fused with the first word of the next. ``None``, other
            non-iterable values, and non-string fields inside an iterable are
            treated as empty instead of raising.

    Returns:
        str: the text with digit characters removed, ``http(s)://...`` and
        ``www....`` URLs removed, ``@name`` tokens removed, surrounding
        whitespace stripped, and all characters lower-cased.
    """
    if not isinstance(text, str):
        try:
            fields = list(text)
        except TypeError:
            # Scalar that is not text (e.g. None or a float placeholder for a
            # missing value): nothing to clean.
            fields = []
        text = ' '.join(field for field in fields if isinstance(field, str))

    ##remove numbers (character by character)
    text_nums = ''.join(char for char in text if not char.isdigit())
    ##remove urls
    text_urls = re.sub(r'https?://\S+|www\.\S+', "", text_nums)
    ##remove nicks
    text_nicks = re.sub(r"\@\S+", "", text_urls)
    # Removing surrounding spaces and converting text into lowercase
    return text_nicks.strip().lower()


##apply the above helper functions
# Tokenizer handed to the vectorizer
def spacy_tokenizer(sentence):
    """Turn a sentence into a list of normalised, filtered token strings.

    The sentence is run through the module-level ``spacy_eng_token`` pipeline.
    Each token becomes its lemma, lower-cased and stripped; a token whose lemma
    is the placeholder ``"-PRON-"`` uses its lower-cased surface form instead.
    Entries found in ``STOP_WORDS`` or in ``punctuations`` are dropped.

    Note: ``punctuations`` is a plain string, so the punctuation check is a
    substring test; an empty string therefore also counts as "in" it and is
    dropped.

    Returns:
        list[str]: the surviving token strings, in document order.
    """
    kept = []
    for token in spacy_eng_token(sentence):
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()

        # Skip stop words and punctuation.
        if normalised in STOP_WORDS or normalised in punctuations:
            continue
        kept.append(normalised)

    return kept

# Text-cleaning step for the pipeline
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every sample."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(sample)`` for each sample in ``X``."""
        cleaned = []
        for sample in X:
            cleaned.append(clean_text(sample))
        return cleaned

    def get_params(self, deep=True):
        """Report no tunable parameters (always an empty dict)."""
        return dict()



##bag-of-words vectorizer: unigram counts over the tokens from spacy_tokenizer
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
print(data_process.head())

##features: the selected text columns as a NumPy array
X = np.array(data_process)

##labels: the 'Type' column as a NumPy array
Y = np.array(df['Type'])

##model: random forest on top of the cleaned, bag-of-words vectorised text
rnd_clf = RandomForestClassifier(random_state=20)

# Create pipeline using Bag of Words
rnd_pipe = Pipeline([("cleaner", predictors()),
                     ('vectorizer', bow_vector),
                     ('classifier', rnd_clf)])


cv = KFold(n_splits=5, random_state=42, shuffle=True)

##perform kfold cv
# X and Y are NumPy arrays at this point, so each fold is selected with plain
# positional indexing (`.iloc` only exists on pandas objects). Each fold is
# fitted on its training part and scored on its held-out part; running a
# second, nested cross-validation inside the loop would only repeat the work.
# NOTE: an earlier run of this script reported an average score of 0.998.
print('preparing to split')
fold_scores = []
for train_index, test_index in cv.split(X):
    print('split started')
    print(train_index, test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    rnd_pipe.fit(X_train, y_train)
    fold_score = accuracy_score(y_test, rnd_pipe.predict(X_test))
    fold_scores.append(fold_score)
    print(fold_score)

##average accuracy over the folds
print(sum(fold_scores) / len(fold_scores))
    
    

##hold out 30% of the samples for the final evaluation
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=10, shuffle=True, test_size=0.3
)

rnd_pipe.fit(X_train, Y_train)

# Persist the fitted pipeline to disk so it can be reused later (e.g. uploaded
# to AWS S3 -- the upload itself is not performed here). The destination can be
# overridden with the MODEL_PATH environment variable; the default is the
# original hard-coded location.
model_path = os.environ.get(
    'MODEL_PATH', 'C:\\Users\\USER\\kagglesync\\Zindi_Air\\fast.joblib'
)
joblib.dump(rnd_pipe, model_path)

Y_predictions = rnd_pipe.predict(X_test)

print(Y_predictions[:20])#[1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 0 0 1]


##get the metrics for your model
# The trailing comments record the output of an earlier run.
print(accuracy_score(Y_test, Y_predictions))# 0.9931403118040089
print(classification_report(Y_test, Y_predictions))
#            precision    recall  f1-score   support

#            0       1.00      0.99      0.99      5961
#            1       0.99      1.00      0.99      5264

#     accuracy                           0.99     11225
#    macro avg       0.99      0.99      0.99     11225
# weighted avg       0.99      0.99      0.99     11225


print(precision_score(Y_test, Y_predictions))# 0.9977203647416414
print(recall_score(Y_test, Y_predictions))# 0.9877750611246944