In [None]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType, IntegerType
from  pyspark.sql.functions import input_file_name
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import col

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("hdfs://orion11:32001/final/sample.csv",header=True,sep="\t");

#hdfs://orion11:32001/final/sample.csv
## bigget sample: hdfs://orion11:32001/final/bigger_sample.csv
## all data:   hdfs://orion11:32001/nytimes_lead_paragraphs_Sep-1851-July-2017.csv

##movie data: hdfs://orion11:32001/final/title.basics.tsv

df = df.select(col("DATE").alias("date"), col("LEAD_PARAGRAPH").alias("text"))

df.show(n=5)

In [None]:
df.createOrReplaceTempView("TEMP_DF")
df2 = spark.sql("""select * from TEMP_DF WHERE temp_df.date LIKE '199%'""")
df2.show()

In [None]:
#when have date and text columns
#build sentimental analysis

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return str(score)

pand = df2.toPandas()
pand['analysis'] = pand['text'].apply(sentiment_analyzer_scores)
pand = pand.drop(columns=['text'])

pand.iloc[:4]

In [None]:
#{'neg': 0.171, 'neu': 0.721, 'pos': 0.108, 'co..
#change columns to be able to count
#group by year
#count neg and pos for each year

In [None]:
import pandas as pd
import json 
import numpy as np
answer = []

for row in pand.itertuples():
    rows = row.analysis
    rows = rows.replace("'",'"') 
    data = json.loads(rows)
    columns = list(data.keys())
    values = list(data.values())
    arr_len = len(values)

    res = pd.DataFrame(np.array(values, dtype=object).reshape(1, arr_len), columns=columns)
    answer.append(res)
    
result = pd.concat(answer)

print(result.iloc[:4])

In [7]:
print(result.sum(axis = 0, skipna = True))

neg          36674.9510
neu         610741.9870
pos          51654.6480
compound     66411.9074
dtype: float64


In [1]:
dfMovies = spark.read.csv("hdfs://orion11:32001/final/title.basics.tsv",header=True,sep="\t");

dfMovies.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [17]:
dfMovies.createOrReplaceTempView("MOV_DF")
dfMovies2 = spark.sql("""select mov_df.genres, mov_df.startYear, mov_df.endYear from MOV_DF WHERE mov_df.startYear LIKE '20%' OR mov_df.endYear LIKE '20%'""")
dfMovies2.show()

+--------------------+---------+-------+
|              genres|startYear|endYear|
+--------------------+---------+-------+
|Comedy,Fantasy,Ro...|     2001|     \N|
|              Comedy|     1951|   2016|
|       Drama,Romance|     1952|   2009|
|       Drama,Romance|     1952|   2009|
|       Drama,Romance|     1956|   2010|
|    Animation,Family|     1960|   2004|
|    Animation,Family|     1960|   2004|
|Comedy,Family,Fan...|     1964|   2004|
|              Comedy|     1962|   2016|
|    Animation,Family|     1965|   2003|
|    Animation,Family|     1965|   2003|
|       Short,Western|     2019|     \N|
|       Short,Western|     2019|     \N|
|Family,Fantasy,Music|     1968|   2001|
|Family,Fantasy,Music|     1968|   2001|
|       Drama,Romance|     1968|   2013|
|Drama,Mystery,Rom...|     1970|   2011|
|        Drama,Sci-Fi|     1970|   2016|
|    Animation,Family|     1971|   2005|
|    Animation,Family|     1971|   2005|
+--------------------+---------+-------+
only showing top

In [18]:
dfMovies2.createOrReplaceTempView("MOV2_DF")
dfMovies2 = spark.sql("""select mov2_df.genres, mov2_df.startYear, mov2_df.endYear from MOV2_DF WHERE mov2_df.genres NOT LIKE '%\\N%'""")
dfMovies2.show()

+--------------------+---------+-------+
|              genres|startYear|endYear|
+--------------------+---------+-------+
|Comedy,Fantasy,Ro...|     2001|     \N|
|              Comedy|     1951|   2016|
|       Drama,Romance|     1952|   2009|
|       Drama,Romance|     1952|   2009|
|       Drama,Romance|     1956|   2010|
|    Animation,Family|     1960|   2004|
|    Animation,Family|     1960|   2004|
|Comedy,Family,Fan...|     1964|   2004|
|              Comedy|     1962|   2016|
|    Animation,Family|     1965|   2003|
|    Animation,Family|     1965|   2003|
|       Short,Western|     2019|     \N|
|       Short,Western|     2019|     \N|
|Family,Fantasy,Music|     1968|   2001|
|Family,Fantasy,Music|     1968|   2001|
|       Drama,Romance|     1968|   2013|
|Drama,Mystery,Rom...|     1970|   2011|
|        Drama,Sci-Fi|     1970|   2016|
|    Animation,Family|     1971|   2005|
|    Animation,Family|     1971|   2005|
+--------------------+---------+-------+
only showing top

In [19]:
from pyspark.sql.functions import split, explode
dfMovies3 = dfMovies2.withColumn('genres',explode(split('genres',',')))
dfMovies3.show()

+---------+---------+-------+
|   genres|startYear|endYear|
+---------+---------+-------+
|   Comedy|     2001|     \N|
|  Fantasy|     2001|     \N|
|  Romance|     2001|     \N|
|   Comedy|     1951|   2016|
|    Drama|     1952|   2009|
|  Romance|     1952|   2009|
|    Drama|     1952|   2009|
|  Romance|     1952|   2009|
|    Drama|     1956|   2010|
|  Romance|     1956|   2010|
|Animation|     1960|   2004|
|   Family|     1960|   2004|
|Animation|     1960|   2004|
|   Family|     1960|   2004|
|   Comedy|     1964|   2004|
|   Family|     1964|   2004|
|  Fantasy|     1964|   2004|
|   Comedy|     1962|   2016|
|Animation|     1965|   2003|
|   Family|     1965|   2003|
+---------+---------+-------+
only showing top 20 rows



In [20]:
dfMovies3.printSchema()

root
 |-- genres: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)



In [21]:
from pyspark.sql.functions import desc

dfMovies4 = dfMovies3.groupBy("genres").count().sort(desc("count"))

In [22]:
dfMovies4.show()

+-----------+-------+
|     genres|  count|
+-----------+-------+
|      Drama|1813285|
|     Comedy|1477256|
|      Short|1018883|
|Documentary| 698311|
|  Talk-Show| 648277|
|    Romance| 614164|
| Reality-TV| 486324|
|     Family| 394810|
|  Animation| 336641|
|      Music| 305194|
|     Action| 298191|
|      Crime| 276681|
|      Adult| 266926|
|  Adventure| 243437|
|  Game-Show| 215690|
|    Fantasy| 154796|
|      Sport| 149935|
|     Horror| 142330|
|    Mystery| 140207|
|   Thriller| 132135|
+-----------+-------+
only showing top 20 rows



In [23]:
import pyspark.sql.functions as F     

c = dfMovies4.agg(F.sum("count")).collect()[0][0]
print(c)

10169311


In [24]:
#def count_per(num):
 #   return num/c

#dfMovies5 = dfMovies4.withColumn('percentage', count_per(dfMovies4.count))

from pyspark.sql.functions import *
dfMovies4 = dfMovies4.withColumn('total', expr("count/10169311"))


dfMovies4.show()

+-----------+-------+--------------------+
|     genres|  count|               total|
+-----------+-------+--------------------+
|      Drama|1813285|  0.1783095236245602|
|     Comedy|1477256| 0.14526608538179234|
|      Short|1018883|  0.1001919402405925|
|Documentary| 698311| 0.06866846731307559|
|  Talk-Show| 648277| 0.06374836997314764|
|    Romance| 614164| 0.06039386542510107|
| Reality-TV| 486324|0.047822708932788074|
|     Family| 394810| 0.03882367251822665|
|  Animation| 336641| 0.03310361931108214|
|      Music| 305194|0.030011276083502608|
|     Action| 298191| 0.02932263552565164|
|      Crime| 276681|0.027207447977547348|
|      Adult| 266926| 0.02624818928244008|
|  Adventure| 243437|0.023938396613103877|
|  Game-Show| 215690| 0.02120989317762039|
|    Fantasy| 154796|0.015221876880351088|
|      Sport| 149935|0.014743870061600044|
|     Horror| 142330|0.013996031786224258|
|    Mystery| 140207|0.013787266413624285|
|   Thriller| 132135|0.012993505656381243|
+----------