In [None]:
import pandas as pd
import json
import numpy  as np
import os
import re

from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer


In [None]:
training_data_path = '/content/drive/MyDrive/NLP/rumoureval2019/rumoureval-2019-training-data'
test_data_path     = '/content/drive/MyDrive/NLP/rumoureval2019/rumoureval-2019-test-data'

twitter_trainingDev_data_path = training_data_path + '/twitter-english'
twitter_test_data_path        = test_data_path + '/twitter-en-test-data'

path_train_key = '/content/drive/MyDrive/NLP/rumoureval2019/rumoureval-2019-training-data/train-key.json'
path_dev_key   = '/content/drive/MyDrive/NLP/rumoureval2019/rumoureval-2019-training-data/dev-key.json'
path_test_key  = '/content/drive/MyDrive/NLP/rumoureval2019/final-eval-key.json'

reddit_train_data_path  =  training_data_path + '/reddit-training-data'
reddit_dev_data_path    =  training_data_path + '/reddit-dev-data'
reddit_test_data_path   =  test_data_path     + '/reddit-test-data'

In [None]:
train_key_df = pd.read_json(path_train_key)
dev_key_df = pd.read_json(path_dev_key)
test_key_df = pd.read_json(path_test_key)

In [None]:
def processRedditKeyDataFrame(key_df, datasetType):
    key_taska_df = pd.DataFrame(key_df['subtaskaenglish'].dropna())
    
    key_taska_df = key_taska_df.reset_index()
    key_taska_df = key_taska_df.rename(columns={'index': 'id', 'subtaskaenglish': 'label'})
    
    if datasetType   ==  'train':
        reddit_key_tasks_df = key_taska_df[4519:] 
    elif datasetType ==  'dev':
        reddit_key_tasks_df = key_taska_df[1049:] 
    elif datasetType ==  'test':
        reddit_key_tasks_df =  key_taska_df[1066:] 
    return reddit_key_tasks_df

In [None]:
reddit_train_key_df = processRedditKeyDataFrame(train_key_df, 'train')
reddit_dev_key_df   = processRedditKeyDataFrame(dev_key_df, 'dev')
reddit_test_key_df  = processRedditKeyDataFrame(test_key_df, 'test')

In [None]:
def processRedditSourcePosts(reddit_dataset_path):
    reddit_dirs = next(os.walk(reddit_dataset_path))[1] 
    reddit_dirs_sorted = sorted(reddit_dirs)
    
    reddit_src_dirs  = []
    reddit_src_posts = []
    
    for directory in reddit_dirs_sorted:
        reddit_src_path = reddit_dataset_path + '/' + directory + '/source-tweet'
        reddit_src_dirs.append(next(os.walk(reddit_src_path))[2])
    
    src_reddit_files = []
    for sdirs in reddit_src_dirs:
        for i in sdirs:
            src_reddit_files.append(i)
    src_reddit_files_sorted = sorted(src_reddit_files)
    
    for file in src_reddit_files_sorted:
        paths = reddit_dataset_path + '/' + file.split('.')[0] + '/source-tweet' + '/' + file 
        reddit_post_dict = {}
        
       
        with open(paths) as f:
            for line in f:
                src = json.loads(line)
                text = src['data']['children'][0]['data']['title']
                rid = src['data']['children'][0]['data']['id']
                
                reddit_post_dict['text'] = text  
                reddit_post_dict['id'] = rid     
                
           
                reddit_post_dict['inre'] = 'None'
                reddit_src_posts.append(reddit_post_dict)

    return reddit_dirs_sorted, reddit_src_posts

In [None]:
reddit_train_dirs_sorted , reddit_train_src_posts = processRedditSourcePosts(reddit_train_data_path)
reddit_train_src_posts_df = pd.DataFrame(reddit_train_src_posts)

reddit_dev_dirs_sorted , reddit_dev_src_posts = processRedditSourcePosts(reddit_dev_data_path)
reddit_dev_src_posts_df = pd.DataFrame(reddit_dev_src_posts)

reddit_test_dirs_sorted , reddit_test_src_posts = processRedditSourcePosts(reddit_test_data_path)
reddit_test_src_posts_df = pd.DataFrame(reddit_test_src_posts)

In [None]:
reddit_train_src_posts_df

Unnamed: 0,text,id,inre
0,Even ants won't eat aspartame!,18dmb4,
1,"""Cancer is a fungus"" - this idea from the 60s ...",1hzz6y,
2,repost from TIL : 'financial guru' Robert Kiyo...,1i8cy7,
3,Is it true that if you are not a member of the...,1i8ljs,
4,How much truth is there in the statement that ...,22o24j,
5,Debunk This: Microwaves are bad because microw...,249p6c,
6,Debunk this: Nicotine isn't really bad for you...,25bvmb,
7,Would Labour win if young people voted?,46uw4y,
8,California To Allow Illegal Immigrants To Vote...,46yxoy,
9,'Queen backs Brexit' - The Sun front page tomo...,49l01s,


In [None]:
def processRedditReplyPosts(reddit_dataset_path, reddit_dirs_sorted):
    replies_files = []
    reddit_replies = []

    for directory in reddit_dirs_sorted:
        reddit_src_path = reddit_dataset_path + '/' + directory + '/replies' #Accessing the replies directory
        replies_files.append(next(os.walk(reddit_src_path))[2])
        
        for i in (next(os.walk(reddit_src_path))[2]):
            paths = reddit_dataset_path + '/' + directory + '/replies' + '/' + i #Accesing each reply file
            reddit_post_dict = {}
            with open(paths) as f:
                for line in f:
                    src = json.loads(line)
                    rid = src['data']['id']
                    inre = src['data']['parent_id']
                    
                    '''A few replies do not have any text data. This was because some of the replies were 
                    deleted but they were kept as is in the rumourEval data'''
                    
                    if 'body' in src['data']: 
                        text = src['data']['body']

                    reddit_post_dict['text'] = text               
                    reddit_post_dict['id'] = rid                  
                    reddit_post_dict['inre'] = inre.split('_')[1] 
                    reddit_post_dict['source'] = directory       
                    reddit_replies.append(reddit_post_dict)
                    
   
  
    return reddit_replies

In [None]:
reddit_train_replies    = processRedditReplyPosts(reddit_train_data_path, reddit_train_dirs_sorted)
reddit_train_replies_df = pd.DataFrame(reddit_train_replies)

reddit_dev_replies    = processRedditReplyPosts(reddit_dev_data_path, reddit_dev_dirs_sorted)
reddit_dev_replies_df = pd.DataFrame(reddit_dev_replies)

reddit_test_replies    = processRedditReplyPosts(reddit_test_data_path, reddit_test_dirs_sorted)
reddit_test_replies_df = pd.DataFrame(reddit_test_replies)

In [None]:
reddit_train_replies_df

Unnamed: 0,text,id,inre,source
0,Snopes has the basics: \nwww.snopes.com/humor...,c8duhn4,18dmb4,18dmb4
1,"Wikipedia would be a good start, I think. \nh...",c8du4n0,18dmb4,18dmb4
2,"Depends on how ants metabolize sugars, I suppo...",c8e0wj3,c8e0jxh,18dmb4
3,What is the tag line in the bottom right corne...,c8e2l5p,c8e0t4f,18dmb4
4,TIL: Aspartame contains 10 calories per teaspo...,c8e0ls2,c8duzth,18dmb4
...,...,...,...,...
663,"&gt; That isn't disputed, it's that he came to...",e2bmzf2,e2bmcct,8yktu5
664,In the press conference earlier today he said ...,e2bos7p,e2bo412,8yktu5
665,[deleted],e2btp0f,e2bta92,8yktu5
666,[deleted],e2bxvw0,e2bxjzw,8yktu5


In [None]:
def redditCleanDf(src_posts_df, replies_df):
    reddit_data = [src_posts_df, replies_df]

    reddit_data = pd.concat(reddit_data)

    reddit_data['id'] = reddit_data.id.astype(str)
    reddit_data['inre'] = reddit_data.inre.astype(str)
    
    reddit_clean_data = pd.DataFrame(reddit_data)
    
    reddit_clean_data.id = reddit_clean_data.id.str.strip()    
    reddit_clean_data.inre = reddit_clean_data.inre.str.strip() 
    return reddit_clean_data

In [None]:

reddit_clean_train_df = redditCleanDf(reddit_train_src_posts_df, reddit_train_replies_df)
reddit_clean_dev_df   = redditCleanDf(reddit_dev_src_posts_df, reddit_dev_replies_df)
reddit_clean_test_df  = redditCleanDf(reddit_test_src_posts_df, reddit_test_replies_df)


reddit_train_withKeys_df = pd.merge(reddit_clean_train_df, reddit_train_key_df, how = 'inner', on = "id", )
reddit_dev_withKeys_df   = pd.merge(reddit_clean_dev_df, reddit_dev_key_df, how = 'inner', on = "id", )
reddit_test_withKeys_df  = pd.merge(reddit_clean_test_df, reddit_test_key_df, how = 'inner', on = "id", )

In [None]:
def fetchRedditDataset(reddit_withKeys_df):
    
    reddit_df = reddit_withKeys_df[['id', 'text']].copy()
    
    reddit_df_new = reddit_df.rename(columns={'id': 'inre', 'text': 'inreText'})
    reddit_df_new1 = reddit_df.rename(columns={'id': 'source', 'text': 'sourceText'})
    
    reddit_dataset = pd.merge(reddit_withKeys_df, reddit_df_new, how = 'left', on = "inre", )
    reddit_dataset1 = pd.merge(reddit_withKeys_df, reddit_df_new1, how = 'left', on = "source", )
    
    return reddit_dataset, reddit_dataset1

In [None]:
reddit_train_dataset_inre, reddit_train_dataset_src = fetchRedditDataset(reddit_train_withKeys_df)
reddit_dev_dataset_inre, reddit_dev_dataset_src= fetchRedditDataset(reddit_dev_withKeys_df)
reddit_test_dataset_inre, reddit_test_dataset_src = fetchRedditDataset(reddit_test_withKeys_df)

reddit_train_dataset_src = pd.merge(reddit_train_dataset_inre, reddit_train_dataset_src, how = 'inner', on = "id",)
reddit_dev_dataset_src = pd.merge(reddit_dev_dataset_inre, reddit_dev_dataset_src, how = 'inner', on = "id",)
reddit_test_dataset_src = pd.merge(reddit_test_dataset_inre, reddit_test_dataset_src, how = 'inner', on = "id",)

reddit_new_train_data_df = reddit_train_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()
reddit_new_dev_data_df = reddit_dev_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()
reddit_new_test_data_df = reddit_test_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()

In [None]:
def removeRedundantData(reddit_df):
    for i in range(0,len(reddit_df)):
        if reddit_df['inre_x'][i] == reddit_df['source_x'][i]:
            reddit_df['sourceText'][i] = np.nan
    return reddit_df

In [None]:
reddit_new_train_data_df = removeRedundantData(reddit_new_train_data_df)
reddit_new_dev_data_df   = removeRedundantData(reddit_new_dev_data_df)
reddit_new_test_data_df  = removeRedundantData(reddit_new_test_data_df)

In [None]:
reddit_new_train_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/RedditTrainDataSrc.csv', encoding='utf-8', index=False)
reddit_new_dev_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/RedditDevDataSrc.csv', encoding='utf-8', index=False)
reddit_new_test_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/RedditTestDataSrc.csv', encoding='utf-8', index=False)

In [None]:
d1 = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/RedditDevDataSrc.csv')

In [None]:
d1

Unnamed: 0,text_x,id,inre_x,source_x,label_x,inreText,sourceText
0,Fukushima spewing equivalent of 112 Hiroshima-...,1jvbd8,,,deny,,
1,[serious] Man and dinosaurs lived at the same ...,31xv6u,,,support,,
2,"Debunk this: Fluoride declared neurotoxin, cau...",4dfdvo,,,query,,
3,Is it true that if you have your phone on char...,5qzxep,,,query,,
4,Debunk this: Mt. Etna has already put out more...,66yxyf,,,query,,
...,...,...,...,...,...,...,...
431,Remember the fluorides in our water replaces w...,e3bk3q8,934q6t,934q6t,comment,Iodine increases IQ and is an essential part o...,
432,"We evolved as omnivores, but I do agree with y...",e3ay5vh,e3ak72f,934q6t,comment,Oh they put iodine in salt here in Latin Ameri...,Iodine increases IQ and is an essential part o...
433,&gt; 7 drops in about a double shot of cold wa...,e3c5joo,e3bqum8,934q6t,comment,I do about 7 drops in about a double shot of c...,Iodine increases IQ and is an essential part o...
434,The book The Iodine Crisis is really good and ...,e3bq78h,934q6t,934q6t,support,Iodine increases IQ and is an essential part o...,


In [None]:
train_key_df = pd.read_json(path_train_key)
dev_key_df = pd.read_json(path_dev_key)
test_key_df = pd.read_json(path_test_key)

In [None]:
def processTwitterKeyDataFrame(key_df, datasetType):
    key_taska_df = pd.DataFrame(key_df['subtaskaenglish'].dropna())
    

    key_taska_df = key_taska_df.reset_index()
    key_taska_df = key_taska_df.rename(columns={'index': 'id', 'subtaskaenglish': 'label'})
    
    if datasetType == 'train':
        twitter_key_tasks_df = key_taska_df[0:4519] 
    elif datasetType == 'dev':
        twitter_key_tasks_df = key_taska_df[0:1049] 
    elif datasetType == 'test':
        twitter_key_tasks_df =  key_taska_df[0:1066]
    return twitter_key_tasks_df

In [None]:
twitter_train_key_df = processTwitterKeyDataFrame(train_key_df, 'train')
twitter_dev_key_df = processTwitterKeyDataFrame(dev_key_df, 'dev')
twitter_test_key_df = processTwitterKeyDataFrame(test_key_df, 'test')

In [None]:
twitter_trainingDev_data_path

'/content/drive/MyDrive/NLP/rumoureval2019/rumoureval-2019-training-data/twitter-english'

In [None]:
twitter_dirs = next(os.walk(twitter_trainingDev_data_path))[1]

In [None]:
twitter_dirs

['putinmissing',
 'ferguson',
 'ebola-essien',
 'illary',
 'prince-toronto',
 'sydneysiege',
 'ottawashooting',
 'charliehebdo',
 'germanwings-crash']

In [None]:
twitter_d1 = []
for i in twitter_dirs:
  twitter_d1.extend(next(os.walk(twitter_trainingDev_data_path+'/'+i))[1])

In [None]:
len(twitter_d1)

325

In [None]:
def processTwitterSourcePosts(twitter_dataset_path):
    twitter_dirs = next(os.walk(twitter_dataset_path))[1]

    twitter_dirs_sorted = sorted(twitter_dirs)
    
    twitter_src_dirs = []
    twitter_src_posts = []
    
    for directory in twitter_dirs_sorted:
        tweet_src_path = twitter_dataset_path + '/' + directory + '/source-tweet' #accessing source directories
        twitter_src_dirs.append(next(os.walk(tweet_src_path))[2])
    
    src_tweet_files = []
    for sdirs in twitter_src_dirs:
        for i in sdirs:
            src_tweet_files.append(i)
    src_tweet_files_sorted = sorted(src_tweet_files)
    
    for file in src_tweet_files_sorted:
        paths = twitter_dataset_path + '/' + file.split('.')[0] + '/source-tweet' + '/' + file
        tweet_post_dict = {}
        
        
        with open(paths) as f:
            for line in f:
                src = json.loads(line)
                text = src['text']
                inre = src['in_reply_to_status_id']
                tid = src['id']
                
                tweet_post_dict['text'] = text 
                tweet_post_dict['id'] = tid   
                tweet_post_dict['inre'] = inre
                twitter_src_posts.append(tweet_post_dict)
  
    return twitter_dirs_sorted, twitter_src_posts

In [None]:
twitter_trainDev_dirs_sorted , twitter_trainDev_src_posts = processTwitterSourcePosts(twitter_trainingDev_data_path+"/charliehebdo")
twitter_trainDev_src_posts_df = pd.DataFrame(twitter_trainDev_src_posts)

twitter_test_dirs_sorted , twitter_test_src_posts = processTwitterSourcePosts(twitter_test_data_path+"/nat-geo-footage")
twitter_test_src_posts_df = pd.DataFrame(twitter_test_src_posts)

In [None]:
def processTwitterReplyPosts(twitter_dataset_path, twitter_dirs_sorted):
    replies_files = []
    twitter_replies = []


    for directory in twitter_dirs_sorted:
        tweet_src_path = twitter_dataset_path + '/' + directory + '/replies'
        replies_files.append(next(os.walk(tweet_src_path))[2])
        
        for i in (next(os.walk(tweet_src_path))[2]):
            paths = twitter_dataset_path + '/' + directory + '/replies' + '/' + i
            tweet_post_dict = {}
            with open(paths) as f:
                for line in f:
                    src = json.loads(line)
                    text = src['text']
                    inre = str(src['in_reply_to_status_id'])
                    tid = src['id']
                    tweet_post_dict['text'] = text         
                    tweet_post_dict['id'] = tid            
                    tweet_post_dict['inre'] = inre         
                    tweet_post_dict['source'] = directory  
                    twitter_replies.append(tweet_post_dict)
   
 
    return twitter_replies

In [None]:
twitter_trainDev_replies    = processTwitterReplyPosts(twitter_trainingDev_data_path+"/charliehebdo", twitter_trainDev_dirs_sorted)
twitter_trainDev_replies_df = pd.DataFrame(twitter_trainDev_replies)

twitter_test_replies     = processTwitterReplyPosts(twitter_test_data_path+"/nat-geo-footage", twitter_test_dirs_sorted)
twitter_test_replies_df  = pd.DataFrame(twitter_test_replies)

In [None]:
def twitterCleanDf(src_posts_df, replies_df):
    twitter_data = [src_posts_df, replies_df]

    twitter_data = pd.concat(twitter_data)

    twitter_data['id']   = twitter_data.id.astype(str)
    twitter_data['inre'] = twitter_data.inre.astype(str)

    twitter_clean_data = pd.DataFrame(twitter_data)

    twitter_clean_data.id   = twitter_clean_data.id.str.strip()      
    twitter_clean_data.inre = twitter_clean_data.inre.str.strip()  
    return twitter_clean_data

In [None]:
twitter_clean_trainDev_df = twitterCleanDf(twitter_trainDev_src_posts_df, twitter_trainDev_replies_df)
twitter_clean_test_df     = twitterCleanDf(twitter_test_src_posts_df, twitter_test_replies_df)

twitter_train_withKeys_df = pd.merge(twitter_clean_trainDev_df, twitter_train_key_df, how = 'inner', on = "id", )
twitter_dev_withKeys_df   = pd.merge(twitter_clean_trainDev_df, twitter_dev_key_df, how = 'inner', on = "id", )
twitter_test_withKeys_df  = pd.merge(twitter_clean_test_df, twitter_test_key_df, how = 'inner', on = "id", )

In [None]:
def fetchTwitterDataset(twitter_withKeys_df):
    
    twitter_df       = twitter_withKeys_df[['id', 'text']].copy()
    twitter_df_new   = twitter_df.rename(columns={'id': 'inre', 'text': 'inreText'})
    twitter_df_new1  = twitter_df.rename(columns={'id': 'source', 'text': 'sourceText'})
    twitter_dataset  = pd.merge(twitter_withKeys_df, twitter_df_new, how = 'left', on = "inre", )
    twitter_dataset1 = pd.merge(twitter_withKeys_df, twitter_df_new1, how = 'left', on = "source", )
    
    return twitter_dataset, twitter_dataset1

In [None]:
twitter_train_dataset_inre, twitter_train_dataset_src = fetchTwitterDataset(twitter_train_withKeys_df)
twitter_dev_dataset_inre, twitter_dev_dataset_src= fetchTwitterDataset(twitter_dev_withKeys_df)
twitter_test_dataset_inre, twitter_test_dataset_src = fetchTwitterDataset(twitter_test_withKeys_df)


twitter_train_dataset_src = pd.merge(twitter_train_dataset_inre, twitter_train_dataset_src, how = 'inner', on = "id",)
twitter_dev_dataset_src = pd.merge(twitter_dev_dataset_inre, twitter_dev_dataset_src, how = 'inner', on = "id",)
twitter_test_dataset_src = pd.merge(twitter_test_dataset_inre, twitter_test_dataset_src, how = 'inner', on = "id",)

twitter_new_train_data_df = twitter_train_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()
twitter_new_dev_data_df = twitter_dev_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()
twitter_new_test_data_df = twitter_test_dataset_src[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','inreText', 'sourceText' ]].copy()

In [None]:
def removeRedundantData(twitter_df):
    for i in range(0,len(twitter_df)):
        if twitter_df['inre_x'][i] == twitter_df['source_x'][i]:
            twitter_df['sourceText'][i] = np.nan
    return twitter_df

twitter_new_train_data_df = removeRedundantData(twitter_new_train_data_df)
twitter_new_dev_data_df   = removeRedundantData(twitter_new_dev_data_df)
twitter_new_test_data_df  = removeRedundantData(twitter_new_test_data_df)

In [None]:
twitter_new_train_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterTrainDataSrc.csv', encoding='utf-8', index=False)
twitter_new_dev_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterDevDataSrc.csv', encoding='utf-8', index=False)
twitter_new_test_data_df.to_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterTestDataSrc.csv', encoding='utf-8', index=False)

In [None]:

def label_to_int(label):
  if label   == 'support':
    return 0
  elif label == 'deny':
    return 1
  elif label == 'query':
    return 2
  elif label == 'comment':
    return 3


def processText(text):
  text = re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", "$URL$",text.strip())
  text = re.sub(r"(@[A-Za-z0-9]+)", "$MENTION$", text.strip())

  return text

In [None]:

    
def processStanceData(twitterDf, RedditDf):
  frames = [twitterDf, RedditDf]

  resultDf = pd.concat(frames)                                                     
  result1  = resultDf.replace(np.nan, '', regex=True)                               

  result1['labelvalue'] = result1.label_x.apply(label_to_int)                      
  result1['SrcInre']    = result1['inreText'].str.cat(result1['sourceText'],sep=" ")

  data = result1[['text_x', 'id', 'inre_x', 'source_x' ,'label_x','SrcInre', 'labelvalue' ]].copy()


 

  data.columns = ['replyText', 'replyTextId', 'previousText', 'sourceText', 'label', 'previousPlusSrcText', 'labelValue']

  data['pReplyText']           = data.replyText.apply(processText)
  data['pPreviousPlusSrcText'] = data.previousPlusSrcText.apply(processText)
  data['TextSrcInre']          = data['pReplyText'].str.cat(data['pPreviousPlusSrcText'],sep=" ")
  return data

In [None]:

twitterTrainDf = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterTrainDataSrc.csv')
redditTrainDf  = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/RedditTrainDataSrc.csv')

twitterDevDf   = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterDevDataSrc.csv')
redditDevDf    = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/RedditDevDataSrc.csv')

twitterTestDf  = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/TwitterTestDataSrc.csv')
redditTestDf   = pd.read_csv('/content/drive/MyDrive/NLP/csvfiles/RedditTestDataSrc.csv')

trainDf = processStanceData(twitterTrainDf, redditTrainDf)
trainDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText,TextSrcInre
0,France: 10 people dead after shooting at HQ of...,552783667052167168.0,,,support,,0.0,France: 10 people dead after shooting at HQ of...,,France: 10 people dead after shooting at HQ of...
1,BREAKING: 10 reportedly shot dead at Paris HQ ...,552785375161499648.0,,,support,,0.0,BREAKING: 10 reportedly shot dead at Paris HQ ...,,BREAKING: 10 reportedly shot dead at Paris HQ ...
2,BREAKING: At least 10 killed in shooting at Fr...,552791196247269376.0,,,support,,0.0,BREAKING: At least 10 killed in shooting at Fr...,,BREAKING: At least 10 killed in shooting at Fr...
3,Eleven dead in shooting at Paris offices of sa...,552791578893619200.0,,,support,,0.0,Eleven dead in shooting at Paris offices of sa...,,Eleven dead in shooting at Paris offices of sa...
4,BREAKING Charlie Hebdo latest: 11 dead 10 woun...,552792544132997120.0,,,support,,0.0,BREAKING Charlie Hebdo latest: 11 dead 10 woun...,,BREAKING Charlie Hebdo latest: 11 dead 10 woun...
...,...,...,...,...,...,...,...,...,...,...
693,"&gt; That isn't disputed, it's that he came to...",e2bmzf2,e2bmcct,8yktu5,comment,"That isn't disputed, it's that he came to Scot...",3.0,"&gt; That isn't disputed, it's that he came to...","That isn't disputed, it's that he came to Scot...","&gt; That isn't disputed, it's that he came to..."
694,In the press conference earlier today he said ...,e2bos7p,e2bo412,8yktu5,deny,There is no point to any of this. Jon ...,1.0,In the press conference earlier today he said ...,There is no point to any of this. Jon ...,In the press conference earlier today he said ...
695,[deleted],e2btp0f,e2bta92,8yktu5,comment,Lol it doesn't though Jon Sopel: Bizarre. @rea...,3.0,[deleted],Lol it doesn't though Jon Sopel: Bizarre. $MEN...,[deleted] Lol it doesn't though Jon Sopel: Biz...
696,[deleted],e2bxvw0,e2bxjzw,8yktu5,comment,"It's just another of the many ""factual errors""...",3.0,[deleted],"It's just another of the many ""factual errors""...","[deleted] It's just another of the many ""factu..."


In [None]:
devDf = processStanceData(twitterDevDf, redditDevDf)
devDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText,TextSrcInre
0,Appalled by the attack on Charlie Hebdo in Par...,552788945017516032,,,support,,0,Appalled by the attack on Charlie Hebdo in Par...,,Appalled by the attack on Charlie Hebdo in Par...
1,Reports of fatality and injuries following sho...,553480082996879360,,,support,,0,Reports of fatality and injuries following sho...,,Reports of fatality and injuries following sho...
2,#BREAKING Paris hostage-taker 'knows' one Char...,553553288625672192,,,support,,0,#BREAKING Paris hostage-taker 'knows' one Char...,,#BREAKING Paris hostage-taker 'knows' one Char...
3,BREAKING: Police order all shops closed in fam...,553561170637238272,,,support,,0,BREAKING: Police order all shops closed in fam...,,BREAKING: Police order all shops closed in fam...
4,@UnbiasedF If you go into such facts it will b...,552797821188206592,552796424266854400,552788945017516032.0,comment,@m33ryg @tnewtondunn @mehdirhasan Can you supp...,3,$MENTION$ If you go into such facts it will be...,$MENTION$ $MENTION$ $MENTION$ Can you supply t...,$MENTION$ If you go into such facts it will be...
...,...,...,...,...,...,...,...,...,...,...
431,Remember the fluorides in our water replaces w...,e3bk3q8,934q6t,934q6t,comment,Iodine increases IQ and is an essential part o...,3,Remember the fluorides in our water replaces w...,Iodine increases IQ and is an essential part o...,Remember the fluorides in our water replaces w...
432,"We evolved as omnivores, but I do agree with y...",e3ay5vh,e3ak72f,934q6t,comment,Oh they put iodine in salt here in Latin Ameri...,3,"We evolved as omnivores, but I do agree with y...",Oh they put iodine in salt here in Latin Ameri...,"We evolved as omnivores, but I do agree with y..."
433,&gt; 7 drops in about a double shot of cold wa...,e3c5joo,e3bqum8,934q6t,comment,I do about 7 drops in about a double shot of c...,3,&gt; 7 drops in about a double shot of cold wa...,I do about 7 drops in about a double shot of c...,&gt; 7 drops in about a double shot of cold wa...
434,The book The Iodine Crisis is really good and ...,e3bq78h,934q6t,934q6t,support,Iodine increases IQ and is an essential part o...,0,The book The Iodine Crisis is really good and ...,Iodine increases IQ and is an essential part o...,The book The Iodine Crisis is really good and ...


In [None]:

testDf = processStanceData(twitterTestDf, redditTestDf)
testDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText,TextSrcInre
0,"""National Geographic channel has paid $ 1 mill...",934715071757819904,,,support,,0,"""National Geographic channel has paid $ 1 mill...",,"""National Geographic channel has paid $ 1 mill..."
1,"""@KenyanTraffic: ""National Geographic channel ...",934828842505723904,,,support,,0,"""$MENTION$: ""National Geographic channel has p...",,"""$MENTION$: ""National Geographic channel has p..."
2,National Geographic channel has paid $ 1 mil...,941305217454403584,,,support,,0,National Geographic channel has paid $ 1 mil...,,National Geographic channel has paid $ 1 mil...
3,National Geographic channel has reportedly pai...,944339600998326274,,,support,,0,National Geographic channel has reportedly pai...,,National Geographic channel has reportedly pai...
4,@KenyanTraffic @LalitKModi @Gidi_Traffic Waoh....,934747500526874624,934715071757819904,934715071757819904.0,comment,"""National Geographic channel has paid $ 1 mill...",3,$MENTION$ $MENTION$ $MENTION$_Traffic Waoh...I...,"""National Geographic channel has paid $ 1 mill...",$MENTION$ $MENTION$ $MENTION$_Traffic Waoh...I...
...,...,...,...,...,...,...,...,...,...,...
756,Sometimes you can't win an argument. :-)\n\nAm...,c5nsrhe,xn2bn,xn2bn,comment,"I've been searching, and can't find a single c...",3,Sometimes you can't win an argument. :-)\n\nAm...,"I've been searching, and can't find a single c...",Sometimes you can't win an argument. :-)\n\nAm...
757,"I'm not a troll.. I figured it was bullshit, b...",c5nsqr2,c5nspru,xn2bn,comment,Just in case this isn't just a troll\n\n- Obam...,3,"I'm not a troll.. I figured it was bullshit, b...",Just in case this isn't just a troll\n\n- Obam...,"I'm not a troll.. I figured it was bullshit, b..."
758,Just in case this isn't just a troll\n\n- Obam...,c5nspru,xn2bn,xn2bn,comment,"I've been searching, and can't find a single c...",3,Just in case this isn't just a troll\n\n- Obam...,"I've been searching, and can't find a single c...",Just in case this isn't just a troll\n\n- Obam...
759,Right-wing republicans? Aren't a lot of them h...,c5nsn66,xn2bn,xn2bn,comment,"I've been searching, and can't find a single c...",3,Right-wing republicans? Aren't a lot of them h...,"I've been searching, and can't find a single c...",Right-wing republicans? Aren't a lot of them h...


In [None]:
x_train = trainDf['TextSrcInre'].tolist()
y_train = trainDf['labelValue'].tolist()


x_dev  = devDf['TextSrcInre'].tolist()
y_dev  = devDf['labelValue'].tolist()
x_test = testDf['TextSrcInre'].tolist()
y_test = testDf['labelValue'].tolist()

#Instantiating TfidfVectorizer object and fitting it on the training set
tfidf         = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))
x_train_feats = tfidf.fit_transform(x_train)

print(x_train_feats)


  (0, 54)	0.2961078206082725
  (0, 1398)	0.25769442938989356
  (0, 905)	0.282676917693621
  (0, 211)	0.2961078206082725
  (0, 1756)	0.21630594560940444
  (0, 2360)	0.2633683425719235
  (0, 2108)	0.08652395767949313
  (0, 53)	0.2961078206082725
  (0, 412)	0.10384043347351882
  (0, 1329)	0.2386008839179131
  (0, 2297)	0.3044715492663311
  (0, 1693)	0.17570591896097137
  (0, 1376)	0.08723161600710759
  (0, 904)	0.282676917693621
  (0, 208)	0.10735377481694162
  (0, 1755)	0.1623568901334913
  (0, 71)	0.172998989831333
  (0, 495)	0.15380913270448754
  (0, 1497)	0.1424132199018204
  (0, 2)	0.1939621385872572
  (0, 696)	0.19705999212582448
  (1, 411)	0.1954324065806648
  (1, 216)	0.25561946290428395
  (1, 1765)	0.3112354312987481
  (1, 836)	0.19509907912459648
  :	:
  (1769, 491)	0.22264309407670407
  (1769, 1013)	0.07247036096112647
  (1769, 1045)	0.07867501913124181
  (1769, 1029)	0.037097198708640255
  (1769, 795)	0.14446994410820022
  (1769, 349)	0.049089211775819604
  (1769, 1942)	0.0741

In [None]:
x_train_feats

<1770x2426 sparse matrix of type '<class 'numpy.float64'>'
	with 84077 stored elements in Compressed Sparse Row format>

In [None]:
trainDf['TextSrcInre'][0].tolist()

['France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses $URL$ ',
 "Even ants won't eat aspartame! "]

In [None]:
trainDf['labelValue'][0].tolist()

[0.0, 0.0]

In [None]:
from sklearn import svm

In [None]:
clf = svm( kernel='rbf')
clf.fit(x_train_feats, y_train)

TypeError: ignored

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score


In [None]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(x_train_feats, y_train)

ValueError: ignored

In [None]:
x_train = trainDf['TextSrcInre'].tolist()
y_train = trainDf['labelValue'].tolist()


x_dev  = devDf['TextSrcInre'].tolist()
y_dev  = devDf['labelValue'].tolist()
x_test = testDf['TextSrcInre'].tolist()
y_test = testDf['labelValue'].tolist()

In [None]:
tf_vectorizer=TfidfVectorizer(ngram_range=(1,3))
x_train_tfidf=tf_vectorizer.fit_transform(trainDf)
#x_test_tfidf=tf_vectorizer.transform(x_test)

In [None]:
clf = svm.LinearSVC()
clf.fit(x_train_tfidf, y_train)

ValueError: ignored

In [None]:
x_train_tfidf

<10x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>