In [4]:
# #import findspark

# import pyspark
# from pyspark.sql import SparkSession, Row
# from pyspark.sql.functions import udf, col, lower, regexp_replace, monotonically_increasing_id, split
# from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover, HashingTF, IDF, OneHotEncoderEstimator, StringIndexer, VectorAssembler, Binarizer
# from pyspark.mllib.regression import LabeledPoint
# from pyspark.mllib.classification import LogisticRegressionWithSGD

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Part 3

In [None]:
# Start a local Spark session, read the training CSV with pandas, and turn it
# into a Spark DataFrame (`df`) for the pipelines in the following cells.

# Imported here because the pyspark imports in the first cell are commented
# out; without this line the cell raises NameError on a fresh kernel.
from pyspark.sql import SparkSession

# The memory sizes below are hand-picked; adjust them to the host machine.
spark = (
    SparkSession.builder.master("local[*]")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.memory.offHeap.enabled", 'true')
    .config("spark.memory.offHeap.size", "32g")
    .getOrCreate()
)

# getOrCreate() returns the session that is already running, so repeating the
# identical builder chain cannot yield an independent session. The name is kept
# as an alias in case other cells refer to it.
spark2 = spark

x = pd.read_csv('train_features.csv')
df = spark.createDataFrame(x)

df.printSchema()
# Previously `df.show` (no parentheses), which only echoed the bound method
# instead of printing rows.
df.show()

# Cleaning plot data

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Bag-of-words pipeline: tokenize the "plot" text, drop stop words, count
# terms into a sparse "features" vector, and index "genre" into a numeric
# "label" column.
# NOTE(review): the vocabulary and label index are fitted on every row of
# `data`; if a train/test split is taken afterwards, test rows have already
# influenced the fitted stages. Confirm this is acceptable.

# Drop identifier columns that carry no signal for the model.
drop_list = ['movie_id', 'movie_name']
data = df.select([column for column in df.columns if column not in drop_list])

# Split on runs of non-word characters.
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")

# Extra tokens to remove on top of the standard English stop words.
add_stopwords = ["http","https","amp","rt","t","c","the"]
# setStopWords() replaces the remover's list rather than extending it, so
# passing `add_stopwords` alone would keep every ordinary stop word ("a",
# "and", "of", ...). Merge with Spark's default English list first;
# dict.fromkeys() removes duplicates while preserving order.
stop_words = list(dict.fromkeys(
    StopWordsRemover.loadDefaultStopWords("english") + add_stopwords
))
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

# Term counts over a vocabulary of at most 10000 terms, each of which must
# appear in at least 5 documents.
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
label_stringIdx = StringIndexer(inputCol = "genre", outputCol = "label")

# Fit all stages, then apply them to produce the modelling dataset.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

In [None]:
# Randomly partition the bag-of-words dataset 70% / 30% (seeded with 100)
# and report how many rows ended up on each side.
train, test = dataset.randomSplit([0.7, 0.3], seed=100)

train_count = train.count()
print("Training Dataset Count: " + str(train_count))

test_count = test.count()
print("Test Dataset Count: " + str(test_count))

# Custom Feature Engineering

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
# StringIndexer is imported explicitly so this cell does not depend on an
# import made in a different cell.
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, StringIndexer

# Word2Vec pipeline: tokenize the "plot" text, drop stop words, embed each
# document as a 300-dimensional "features" vector, and index "genre" into a
# numeric "label" column.
# NOTE(review): every stage is fitted on all rows of `data`; if a train/test
# split is taken afterwards, test rows have already influenced the embedding.

# Drop identifier columns that carry no signal for the model.
drop_list = ['movie_id', 'movie_name']
data = df.select([column for column in df.columns if column not in drop_list])

# Split on runs of non-word characters.
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")

# Extra tokens to remove on top of the standard English stop words.
add_stopwords = ["http","https","amp","rt","t","c","the"]
# setStopWords() replaces the remover's list rather than extending it, so
# merge with Spark's default English list; dict.fromkeys() removes duplicates
# while preserving order.
stop_words = list(dict.fromkeys(
    StopWordsRemover.loadDefaultStopWords("english") + add_stopwords
))
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens").setStopWords(stop_words)

# minCount=0 keeps every token in the vocabulary, however rare.
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")
label_stringIdx = StringIndexer(inputCol = "genre", outputCol = "label")

doc2vec_pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, w2v, label_stringIdx])
doc2vec_model = doc2vec_pipeline.fit(data)
doc2vecs_df = doc2vec_model.transform(data)

doc2vecs_df.show()

In [None]:
# Randomly partition the Word2Vec dataset 70% / 30% (seeded with 100) and
# report how many rows ended up on each side.
w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.7, 0.3], seed=100)

w2v_train_count = w2v_train_df.count()
print("Training Dataset Count: " + str(w2v_train_count))

w2v_test_count = w2v_test_df.count()
print("Test Dataset Count: " + str(w2v_test_count))

In [None]:
# RANDOM FOREST on the Word2Vec ("doc2vec") document vectors:
# train on w2v_train_df, predict on w2v_test_df, and report accuracy plus
# precision / recall / F1 for the class indexed 0.

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(w2v_train_df)
predictions = rfModel.transform(w2v_test_df)
results = predictions.select(['prediction', 'label'])
predictionAndLabels=results.rdd
metrics = MulticlassMetrics(predictionAndLabels)

# MLlib's confusion matrix has actual classes in rows and predicted classes
# in columns, ordered by ascending class label.
cm = metrics.confusionMatrix().toArray()

# Accuracy is the share of all predictions on the diagonal. The earlier form
# added only cm[0][0] and cm[1][1], which under-counts with more than two
# classes and raises IndexError when only one class is present.
accuracy = _ratio(cm.trace(), cm.sum())

# Precision / recall for class 0, using the full column (everything predicted
# as 0) and the full row (everything that truly is 0). For a 2x2 matrix these
# are exactly the sums used before.
precision = _ratio(cm[0][0], cm[:, 0].sum())
recall = _ratio(cm[0][0], cm[0, :].sum())
f1score = _ratio(2 * (precision * recall), precision + recall)

print("RandomForestClassifier accuracy: ",accuracy)
print("RandomForestClassifier precision: ",precision)
print("RandomForestClassifier recall: ", recall)
print("RandomForestClassifier f1score: ", f1score)

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Fit a scikit-learn random forest on the precomputed training features.
x = pd.read_csv('train_features.csv')
y= pd.read_csv('train_labels.csv')

# Drop the first column of the feature table.
# presumably a saved row index; verify against the CSV.
X_train = x.iloc[:, 1:]

# Drop the first column AND the first row of the label table.
# NOTE(review): the row offset suggests the feature CSV has no header line, so
# read_csv consumes its first sample as column names and the labels must be
# shifted to stay aligned. Confirm against the files; if so, reading the
# features with header=None would recover that sample.
y_train = y.iloc[1:,1:]

clf = RandomForestClassifier(max_depth=5, random_state=0)
# Pass the labels as a flat 1-D array: handing scikit-learn an (n, 1) frame
# relies on implicit reshaping and emits a DataConversionWarning.
clf.fit(X_train, y_train.values.ravel())



  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [18]:
# Load the test feature table and keep every column except the first.
# NOTE(review): when this frame is displayed its column labels are numeric
# values, which suggests the CSV has no header line and its first sample is
# being consumed as column names. Confirm against the file before relying on
# the row count.
x_test = pd.read_csv('test_features.csv')
feature_columns = x_test.columns[1:]
X_tester = x_test.loc[:, feature_columns]

In [19]:
# Display the test feature matrix as this cell's output.
X_tester

Unnamed: 0,-214410031.731486,6156482.839096,-40009.082892,-106798.8679,-22075.865011,-131.213147,-42677.883959,-1345.72629,-2139.74082,1252.08256,...,-0.004107,-0.002644,0.000573,0.004455,0.000742,0.000251,0.010003,-0.002461,7e-06,-0.001208
0,8.887227e+08,-5.697653e+06,-3.246350e+05,-1.458124e+04,3.360443e+04,-6616.477948,-62974.019123,5014.299745,-5103.807233,2853.937468,...,0.004792,-0.005321,0.006356,0.020417,0.003568,0.003092,0.004354,0.000841,0.006413,-0.002123
1,-1.834045e+09,2.012896e+06,-5.757508e+04,-7.745355e+04,4.855096e+04,3423.537938,-22131.213006,462.895476,2560.000549,-2431.303783,...,-0.009077,0.004042,-0.002609,-0.008480,-0.002297,0.000568,-0.002650,0.001871,0.004263,-0.009482
2,-6.040876e+08,1.452041e+06,-1.232778e+05,-6.675106e+04,3.544916e+04,1439.544188,-13681.724960,-2333.485201,-1307.836187,580.551951,...,0.000050,-0.001723,-0.003022,-0.002693,0.001505,-0.000686,-0.002552,-0.003826,-0.005397,-0.003756
3,5.274843e+07,-4.289884e+06,-2.555023e+05,-3.156241e+04,4.914208e+04,358.703139,25320.387579,-5583.576500,-2432.435069,1759.132287,...,-0.005462,-0.004288,-0.000728,0.001797,0.002756,0.001359,0.004625,-0.002706,0.003057,-0.002346
4,-5.457172e+08,-5.674806e+06,1.161370e+06,1.208870e+06,-1.457188e+06,49643.679339,-28738.282607,-1424.025959,-907.633609,3950.890147,...,0.004040,0.004015,-0.012465,0.003305,0.007033,0.015414,-0.001974,-0.000705,-0.009932,-0.005075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1.951889e+09,-3.505618e+06,-3.363164e+05,-4.928346e+04,-4.999543e+03,-4248.325622,-15854.509180,-3654.574071,-2455.870956,1869.683551,...,0.005353,-0.002749,-0.002504,-0.000439,0.007173,0.003035,0.000376,-0.002145,0.001481,-0.005839
391,-1.817505e+09,1.287522e+06,-7.595760e+04,-1.031154e+05,1.630942e+04,11694.002880,145353.031200,11890.845010,280.648815,-1890.998535,...,0.005877,0.000527,-0.003879,-0.003899,0.007070,-0.000054,0.006222,-0.006390,-0.000011,-0.001941
392,-1.365482e+09,9.631980e+05,-1.015827e+05,-8.411405e+04,3.081189e+04,2342.783845,-20617.293440,-3844.749274,1112.927143,-875.361105,...,-0.000641,0.000945,0.001797,0.002050,0.001812,-0.000501,0.001465,0.001771,0.005424,-0.004140
393,-1.683890e+09,-1.339837e+06,-1.314145e+05,-7.987772e+04,2.784978e+04,5843.349040,62609.859780,-231.248927,1437.334083,-1470.524014,...,0.001491,0.002567,-0.002293,0.003350,-0.004128,-0.001047,-0.001335,0.001242,-0.000945,0.005124


In [20]:
# Predict a class label for every row of the test feature matrix using the
# fitted random forest.
y_pred = clf.predict(X_tester)

In [21]:
# Display the predicted labels as this cell's output.
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,