In [12]:
# The following code reads the input data set and creates a train and test data set from the all records

import pandas as pd
import numpy as np

# column names for dataframe
cols = ['sentiment','id','date','query_string','user','text']
def main():
    # read training data
    df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1',names=cols)
    # shuffling the records
    df = df.sample(frac=1).reset_index(drop=True)
    # splitting the original dataset into train and test randomly in 99:1 ratio
    np.random.seed(777)
    msk = np.random.rand(len(df)) < 0.99
    train = df[msk].reset_index(drop=True)
    train= train[:300000]# using only 300000 rows for training
    test = df[~msk].reset_index(drop=True)
    # save both train and test as CSV files
    train.to_csv('train_data.csv')
    print(train.shape)
    test.to_csv('test_data.csv')

if __name__=="__main__":
	main()

(300000, 6)


In [13]:

#import libraries
import sys
import findspark
findspark.init("/home/aritra/spark/spark-2.4.0-bin-hadoop2.7")
#import pyspark as ps
import warnings
import re
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import PipelineModel
import os
import findspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql import functions as fun
from pyspark.sql import types as t
from pyspark.sql.types import StringType
cwd = os.getcwd()

# specify the parameters for saving and retreiving the model in disk 
inputdir = cwd
outputfile = "/home/aritra/CS 631/Project/Birendra/out"
modeldir = "/home/aritra/CS 631/Project/Birendra/model"

# Regex pattern for pre-processing
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1,pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

# preprocessing codes
# remove Twitter handle and URL, remove URL pattern starting with www., and transform to lower characters and remove numbers and special characters

def pre_processing(column):
    step1 = re.sub(combined_pat, '', column)
    step2 = re.sub(www_pat, '', step1)
    step3 = step2.lower()
    step4 = neg_pattern.sub(lambda x: negations_dic[x.group()], step3)
    final = re.sub(r'[^A-Za-z ]','',step4)
    return final.strip()

# building a pipeline following below order
# tokenizer + create n-gram + count vceorizer + inverse doc freq + assembler+  encoding target labels

def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet',outputCol='words')]
    ngrams = [NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i)) for i in range(1,4)]
    cv = [CountVectorizer(vocabSize=5460, inputCol='{0}_grams'.format(i), outputCol='{0}_tf'.format(i)) for i in range(1,4)]
    idf = [IDF(inputCol='{0}_tf'.format(i), outputCol='{0}_tfidf'.format(i), minDocFreq=5) for i in range(1,4)]
    assembler = [VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1,4)], outputCol='features')]
    label = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)] # lr: train a logistic regression model
    pipeline = Pipeline(stages=tokenizer+ngrams+cv+idf+assembler+label+lr)
    return pipeline

# below function for training model or using model for classification from trained and saved model

def main(sqlc,input_dir,loaded_model=None):
    print('reading data from {}'.format(input_dir))
    if not loaded_model:
        train_set = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('train_data.csv')
    test_set = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('test_data.csv')
    print('preprocessing data...')
    classify_pp = fun.udf(pre_processing, t.StringType())
    if not loaded_model:
        train_set = train_set.withColumn('tweet', classify_pp(f.col('text')))
    test_set = test_set.withColumn('tweet', classify_pp(f.col('text')))
    if not loaded_model:
        pipeline = build_pipeline()
        print('training...')
        model = pipeline.fit(train_set)
    else:
        model = loaded_model
    print('making predictions on test data...')
    predictions = model.transform(test_set)
    accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
    return model, predictions, accuracy


if __name__=="__main__":
    # create a SparkContext while checking if there is already SparkContext created
    try:
        sc = SparkContext(appName="YourTest", master="local[2]")
        sc.setLogLevel("ERROR")
        sqlContext = SQLContext(sc)
        print('Created a SparkContext')
    except ValueError:
        warnings.warn('SparkContext already exists in this scope')
    # fitting the model after preprocessing
    pipelineFit, predictions, accuracy = main(sqlContext,inputdir)
    print('test data accuracy {}'.format(accuracy))
    # saving model predictions
    print('predictions saved to {}'.format(outputfile))
    # saving the trained model to defined location
    print('saving model to {}'.format(modeldir))
    pipelineFit.save(modeldir)
    # Testing model predictions
    loadedModel = PipelineModel.load(modeldir)
    _, _, loaded_accuracy = main(sqlContext,inputdir,loadedModel)
    print('accuracy of the model is {}'.format(loaded_accuracy))



reading data from /home/aritra/CS 631/Project/Birendra
preprocessing data...
training...
making predictions on test data...
test data accuracy 0.7934877350267713
predictions saved to /home/aritra/CS 631/Project/Birendra/out
saving model to /home/aritra/CS 631/Project/Birendra/model
reading data from /home/aritra/CS 631/Project/Birendra
preprocessing data...
making predictions on test data...
accuracy of the model is 0.7934877350267713


AttributeError: 'PipelineModel' object has no attribute 'coefficientMatrix'