In [1]:
import findspark
findspark.init()
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext
import sys
import time
import pandas as pd

In [2]:
master = "local"
if len(sys.argv)==2:
    master = sys.argv[1]
spark = SparkSession.builder.appName("UDF").master(master).getOrCreate()
spark

In [21]:
sc = spark.sparkContext
sc

In [22]:
file_path = "../data/movies.csv"

In [23]:
# file_data = spark.read.csv(file_path,sep=",")
file_data = spark.read.format("csv").option("header","true").load(file_path) # 使用第一行作为列名
file_data

DataFrame[movieId: string, title: string, genres: string]

#### df.head()和df.show()的区别

In [24]:
file_data.head(2)

[Row(movieId='1', title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(movieId='2', title='Jumanji (1995)', genres='Adventure|Children|Fantasy')]

In [25]:
file_data.show(2)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



In [26]:
df_movies = file_data

In [27]:
# 创建视图
df_movies.createOrReplaceTempView("movies")

### 使用sparksession创建出udf函数，进行查找电影类型名称的长度

In [28]:
from pyspark.sql.types import IntegerType
# 注册自定义函数
spark.udf.register("fun1",lambda x:len(x))
spark.sql("select genres,fun1(genres) from movies limit 5").show()

+--------------------+------------+
|              genres|fun1(genres)|
+--------------------+------------+
|Adventure|Animati...|          43|
|Adventure|Childre...|          26|
|      Comedy|Romance|          14|
|Comedy|Drama|Romance|          20|
|              Comedy|           6|
+--------------------+------------+



In [29]:
# df_movies.describe(df_movies["movieId"]) # 会出错 TypeError: Column is not iterable
df_movies.describe("movieId").show()
df_movies.describe(["movieId"]).show()

+-------+------------------+
|summary|           movieId|
+-------+------------------+
|  count|              9125|
|   mean|31123.291835616437|
| stddev| 40782.63360397416|
|    min|                 1|
|    max|             99992|
+-------+------------------+

+-------+------------------+
|summary|           movieId|
+-------+------------------+
|  count|              9125|
|   mean|31123.291835616437|
| stddev| 40782.63360397416|
|    min|                 1|
|    max|             99992|
+-------+------------------+



### RDD DataFrame DataSet

## <font color="red">真就莫名其妙呗，在这里出错，结果重启一下服务再提交就可以了。。。- -</font>

In [30]:
# df转rdd
rdd_movies = df_movies.rdd.map(lambda x:"movieId is {},title is {},genres is {}".format(x.movieId,x.title,x.genres)).collect()

In [31]:
for i in rdd_movies:
    print(i)

movieId is 1,title is Toy Story (1995),genres is Adventure|Animation|Children|Comedy|Fantasy
movieId is 2,title is Jumanji (1995),genres is Adventure|Children|Fantasy
movieId is 3,title is Grumpier Old Men (1995),genres is Comedy|Romance
movieId is 4,title is Waiting to Exhale (1995),genres is Comedy|Drama|Romance
movieId is 5,title is Father of the Bride Part II (1995),genres is Comedy
movieId is 6,title is Heat (1995),genres is Action|Crime|Thriller
movieId is 7,title is Sabrina (1995),genres is Comedy|Romance
movieId is 8,title is Tom and Huck (1995),genres is Adventure|Children
movieId is 9,title is Sudden Death (1995),genres is Action
movieId is 10,title is GoldenEye (1995),genres is Action|Adventure|Thriller
movieId is 11,title is American President, The (1995),genres is Comedy|Drama|Romance
movieId is 12,title is Dracula: Dead and Loving It (1995),genres is Comedy|Horror
movieId is 13,title is Balto (1995),genres is Adventure|Animation|Children
movieId is 14,title is Nixon (1995

movieId is 3299,title is Hanging Up (2000),genres is Comedy|Drama
movieId is 3300,title is Pitch Black (2000),genres is Horror|Sci-Fi|Thriller
movieId is 3301,title is Whole Nine Yards, The (2000),genres is Comedy|Crime
movieId is 3302,title is Beautiful People (1999),genres is Comedy
movieId is 3303,title is Black Tar Heroin: The Dark End of the Street (2000),genres is Documentary
movieId is 3304,title is Blue Collar (1978),genres is Crime|Drama
movieId is 3306,title is Circus, The (1928),genres is Comedy
movieId is 3307,title is City Lights (1931),genres is Comedy|Drama|Romance
movieId is 3308,title is Flamingo Kid, The (1984),genres is Comedy|Drama
movieId is 3309,title is Dog's Life, A (1918),genres is Comedy
movieId is 3310,title is Kid, The (1921),genres is Comedy|Drama
movieId is 3313,title is Class Reunion (1982),genres is Comedy
movieId is 3314,title is Big Trees, The (1952),genres is Action|Drama
movieId is 3316,title is Reindeer Games (2000),genres is Action|Thriller
movieId

movieId is 8811,title is Yu-Gi-Oh! (2004),genres is Action|Adventure|Animation|Fantasy
movieId is 8813,title is We Don't Live Here Anymore (2004),genres is Drama
movieId is 8814,title is Without a Paddle (2004),genres is Comedy
movieId is 8819,title is Double Trouble (1967),genres is Musical
movieId is 8820,title is Spinout (1966),genres is Comedy|Musical
movieId is 8821,title is Harum Scarum (1965),genres is Comedy|Musical
movieId is 8823,title is Sting II, The (1983),genres is Comedy|Crime
movieId is 8827,title is Bill Cosby, Himself (1983),genres is Comedy|Documentary
movieId is 8828,title is Dead Ringer (1964),genres is Drama|Thriller
movieId is 8830,title is Anacondas: The Hunt for the Blood Orchid (2004),genres is Adventure|Drama|Horror|Sci-Fi|Thriller
movieId is 8831,title is Suspect Zero (2004),genres is Crime|Thriller
movieId is 8832,title is Warriors of Heaven and Earth (Tian di ying xiong) (2003),genres is Action|Adventure|Drama
movieId is 8833,title is Vanity Fair (2004),ge

movieId is 96821,title is Perks of Being a Wallflower, The (2012),genres is Drama|Romance
movieId is 96829,title is Hunt, The (Jagten) (2012),genres is Drama
movieId is 96832,title is Holy Motors (2012),genres is Drama|Fantasy|Musical|Mystery|Sci-Fi
movieId is 96849,title is Sparkle (2012),genres is Drama|Musical
movieId is 96861,title is Taken 2 (2012),genres is Action|Crime|Drama|Thriller
movieId is 96863,title is Paperboy, The (2012),genres is Thriller
movieId is 96901,title is Making of a Legend: Gone with the Wind, The (1988),genres is Documentary
movieId is 96911,title is Royal Affair, A (Kongelig affære, En) (2012),genres is Drama|Romance
movieId is 97057,title is Kon-Tiki (2012),genres is Adventure|Documentary|Drama
movieId is 97168,title is Marley (2012),genres is Documentary
movieId is 97188,title is Sinister (2012),genres is Horror|Thriller
movieId is 97225,title is Hotel Transylvania (2012),genres is Animation|Children|Comedy
movieId is 97230,title is Side by Side (2012),ge

In [32]:
type(rdd_movies)

list

In [33]:
df_movies.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
# 这两句连接mysql的语句是写错了的。在没有调用load函数的时候，语句运行正常。但是加上load函数之后，就直接报错。java.sql.SQLException: No suitable driver

# dt = spark.read.format("jdbc").options(url="jdbc:mysql://localhost:3306/mydb?user=root&password=123456",dbtable="userphone").load()

# dataframe_mysql=spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/mydb").option("dbtable", "userphone").option("user", "root").option("password", "123456").load()

In [22]:
dt = spark.read.format("jdbc").options(url="jdbc:mysql://localhost:3306/mydb",driver="com.mysql.jdbc.Driver",dbtable="userphone",user="root",password="123456")

In [23]:
jdbcDF = dt.load()

In [24]:
jdbcDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- phone: string (nullable = true)
 |-- trueName: string (nullable = true)



In [25]:
jdbcDF.show()

+---+-----------+--------+
| id|      phone|trueName|
+---+-----------+--------+
|  1|15733218050|    lili|
|  2|15778423030|    lisa|
|  3|18620192711|     tom|
|  4|15733218050|    john|
+---+-----------+--------+

