In [1]:
import pyspark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.mllib import *
from pyspark.sql import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import Word2Vec
import pyspark.sql.functions as f
import pandas as pd
from pyspark.mllib.classification import LogisticRegressionWithSGD,LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
import pickle

In [2]:
# Build (or reuse) the notebook's Spark session, requesting "8g" for both
# executor and driver memory.
spark = (
    SparkSession.builder
    .appName("My Demo")
    .config("spark.some.config.option", "some-value")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Directory holding the three CSV inputs. Kept in a single constant so the
# notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = "/Users/anand/Downloads/UB/Sem2/DIc/A3/dic487-587"

# Training data, read into pandas.
train_csv = pd.read_csv(f"{DATA_DIR}/train.csv")
# Test data, read into pandas.
test_csv = pd.read_csv(f"{DATA_DIR}/test.csv")
# Mapping table; its "0" and "Unnamed: 0" columns are later turned into the
# lookup dict used to encode the genre labels.
map_csv = pd.read_csv(f"{DATA_DIR}/mapping.csv")

In [4]:
# Convert the pandas frames into Spark DataFrames.
# Arrow accelerates the pandas -> Spark conversion. The setting name must end
# in "enabled"; a key spelled "...arrow.enable" is accepted silently but is not
# a setting Spark reads, so Arrow would stay off.
# NOTE(review): Spark 3.x prefers "spark.sql.execution.arrow.pyspark.enabled"
# and treats this key as a deprecated alias - confirm against the cluster version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
train_set = spark.createDataFrame(train_csv)
test_set = spark.createDataFrame(test_csv)
mapping = spark.createDataFrame(map_csv)
# Number of rows in the mapping table; later cells use it as the length of the
# one-hot label vector and as the number of per-label models to train.
count = mapping.count()

In [5]:
# One-hot encoding, step 0: collect the mapping table to the driver as a plain
# dict keyed by its "0" column with the "Unnamed: 0" column as the value.
mg = {row[0]: row[1] for row in mapping.select("0", "Unnamed: 0").collect()}

from pyspark.sql.types import *

def label_to_id(g):
    """Convert a stringified genre list into a sorted list of genre ids.

    Parameters
    ----------
    g : str or None
        Text of the form ``"['Drama', 'Comedy']"``: a bracketed,
        comma-separated list of quoted genre names.

    Returns
    -------
    list
        The ids found in the module-level ``mg`` lookup for each listed genre,
        in ascending order. Genres missing from ``mg`` are skipped, and an
        empty list (``"[]"``) or a null/empty input yields ``[]``.
    """
    # A null or empty cell carries no genres.
    if not g:
        return []

    ids = []
    # Drop the surrounding brackets, then split the items on commas.
    for item in g[1:-1].split(","):
        # Trim whitespace, then strip the quote character at each end.
        genre_name = item.strip()[1:-1]
        genre_id = mg.get(genre_name)
        # Skip names that are not in the mapping: a None id cannot be sorted
        # alongside integers and cannot be used as a list index downstream.
        if genre_id is not None:
            ids.append(genre_id)
    ids.sort()
    return ids
# Expose label_to_id as a Spark UDF that returns an array of integers, then
# derive the "lab" column from the raw "genre" column.
udff = f.udf(label_to_id, returnType=ArrayType(IntegerType()))

train_set = train_set.withColumn("lab", udff(f.col("genre")))

def one_hot_list(g):
    """Build a 0/1 indicator list of length ``count`` from a list of indices.

    Every position listed in ``g`` is set to 1; all other positions stay 0.
    ``count`` is read from module scope.
    """
    encoded = [0] * count
    for position in g:
        encoded[position] = 1
    return encoded

# Rebind udff to the one-hot encoder (integer-array result) and add the
# "label" column; the result is stored under a new name, traindata.
udff = f.udf(one_hot_list, returnType=ArrayType(IntegerType()))

traindata = train_set.withColumn("label", udff(f.col("lab")))

#traindata.show()

+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|movie_id|          movie_name|                plot|               genre|             lab|               label|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|          [0, 5]|[1, 0, 0, 0, 0, 1...|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|  [0, 4, 10, 17]|[1, 0, 0, 0, 1, 0...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|      [0, 4, 16]|[1, 0, 0, 0, 1, 0...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|             [1]|[0, 1, 0, 0, 0, 0...|
|  595909|   A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|       [0, 5, 6]|[1, 0, 0, 0, 0, 1...|
| 5272176|            End Game|The president is ...|['Action/Adventur...|   [0, 3, 4, 10]|[1, 0, 0, 1, 1

In [6]:
# Tokenizer stage: reads the "plot" column and writes token lists to "words",
# using the regex \W as its pattern.
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="plot",
    outputCol="words",
)

In [7]:
# stopwordsRemoverList = ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any',
#                  'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
#                  'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't",
#                  'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few',
#                  'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven',
#                  "haven't", 'having', 'he', 'her', 'here','hers', 'herself', 'him', 'himself', 'his', 'how',
#                  'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll',
#                  'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 
#                  'needn',"needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
#                  'other','our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't",
#                  'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
#                  'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these',
#                  'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
#                  'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while',
#                  'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd",
#                  "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

In [13]:
# Spark ML's built-in English stop-word list.
stopwordsRemoverList = StopWordsRemover.loadDefaultStopWords("english")
# Stop-word stage: reads "words" and writes the remaining tokens to "filtered".
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stopwordsRemoverList,
)

In [14]:
# Embedding stage: 300-dimensional Word2Vec over the "filtered" tokens, with
# minCount=0; the vectors are written to "word2vec_features".
word2vec = Word2Vec(
    inputCol="filtered",
    outputCol="word2vec_features",
    vectorSize=300,
    minCount=0,
)

In [15]:
# Tokenize -> remove stop words -> Word2Vec, fitted on the training plots only.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, word2vec])
fitted_pipeline_train = pipeline.fit(train_set)
print("Training data done!")
# The test set must be transformed by the SAME fitted pipeline as the training
# set. Fitting a second Word2Vec model on the test plots would put the test
# features in a different embedding space from the one the classifiers are
# trained on, making their predictions meaningless. The name
# fitted_pipeline_test is kept because a later cell uses it.
fitted_pipeline_test = fitted_pipeline_train
print("Test data done")

Training data done!
Test data done


In [16]:
# Run both datasets through their fitted pipelines to attach the "words",
# "filtered" and "word2vec_features" columns.
ready_for_train, ready_for_test = (
    fitted_pipeline_train.transform(traindata),
    fitted_pipeline_test.transform(test_set),
)

In [17]:

# Train one binary logistic-regression model per label position and collect its
# predictions on the test set. After the loop, x[i] is the list of
# (movie_id, prediction) pairs produced by the model for label position i.

# Loop-invariant inputs: convert the feature vectors once and cache them, so
# the pipeline transform and UDFs are not recomputed on every iteration.
train_rows = ready_for_train.rdd.map(
    lambda row: (row.label, MLLibVectors.fromML(row.word2vec_features))
).cache()
test_rows = ready_for_test.rdd.map(
    lambda row: (row.movie_id, MLLibVectors.fromML(row.word2vec_features))
).cache()

x = []
for i in range(count):
    # Bind i as a default argument so the closure carries this iteration's
    # label position rather than looking up the loop variable later.
    parsedData = train_rows.map(
        lambda pair, label_index=i: LabeledPoint(pair[0][label_index], pair[1])
    )
    model = LogisticRegressionWithLBFGS.train(parsedData)
    # Persist each per-label model next to the notebook.
    with open('task3model' + str(i) + '.pkl', 'wb') as fid:
        pickle.dump(model, fid)
    # Bind the freshly trained model explicitly for the same reason as above.
    labelsAndPreds = test_rows.map(
        lambda pair, clf=model: (pair[0], clf.predict(pair[1]))
    )
    x.append(labelsAndPreds.collect())
    #print(f"Done for count: {i}")

# Release the cached feature RDDs once every label has been processed.
train_rows.unpersist()
test_rows.unpersist()

In [18]:
# Exporting the dataset
import csv
from csv import reader,writer
import pandas as pd
column_name = ['movie_id', 'predictions']

# x[i][j] is the (movie_id, prediction) pair for label position i and test
# row j. Build one output record per test row: the movie id, followed by the
# per-label predictions joined with single spaces.
ids = []
records = []
for j in range(len(x[0])):
    ids.append(str(x[0][j][0]))
    preds = [str(int(x[i][j][1])) for i in range(count)]
    records.append([ids[j], ' '.join(preds)])

# Build the frame in one step: growing a DataFrame row by row is quadratic, and
# DataFrame.append was removed in pandas 2.0. The all-zero index reproduces the
# index that repeated appends of single-row frames produced, so the written CSV
# keeps its existing layout (pass index=False to to_csv to drop that column).
data = pd.DataFrame(records, columns=column_name, index=[0] * len(records))

data.to_csv('/Users/anand/Downloads/UB/Sem2/DIc/A3/result713.csv')