In [1]:
!apt-get install openjdk-8-jdk-headless #jdk install
!wget -q http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz #spark file
!tar -xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install findspark

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxtst6 amd64 2:1.2.3-1build4 [13.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jre-headless amd64 8u422-b05-1~22.04 [30.8 MB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jdk-headless amd64 8u422-b05-1~22.04 [8,843 kB]
Fetched 39.

In [2]:
import os
import findspark

#환경변수에 path 지정
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['SPARK_HOME'] = '/content/spark-3.0.0-bin-hadoop3.2'

#spark의 경우 잘 찾지 못하는 경우가 있어 findspark를 이용
findspark.init()

In [3]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

In [4]:
spark = SparkSession.builder \
        .master("local") \
        .appName("Colab") \
        .getOrCreate()

In [5]:
spark

In [6]:
point_his = spark.read.parquet('./sample_data/point_his.parquet')
member_dup = spark.read.parquet('./sample_data/member_dup.parquet')
member = spark.read.parquet('./sample_data/member.parquet')
item_his = spark.read.parquet('./sample_data/item_his.parquet')
study_his = spark.read.parquet('./sample_data/study_his.parquet')
regdate = spark.read.parquet('./sample_data/regdate.parquet')

In [7]:
point_his.show(3)
member_dup.show(3)
member.show(3)
item_his.show(3)
study_his.show(3)
regdate.show(3)

+-----+-------+--------+-----+
|  idx|proc_ym|proc_ymd|point|
+-----+-------+--------+-----+
|96465| 202306|20230624| 1000|
|96465| 202306|20230624|  500|
|87940| 202304|20230405| 2000|
+-----+-------+--------+-----+
only showing top 3 rows

+-----+---+--------+-------+
|  idx|sex|  status|  grade|
+-----+---+--------+-------+
| 6884| 여|유료회원|초3학년|
| 6331| 남|유료회원|초3학년|
|69294| 남|유료회원|초5학년|
+-----+---+--------+-------+
only showing top 3 rows

+-----+---+--------+-------+
|  idx|sex|  status|  grade|
+-----+---+--------+-------+
|  100| 남|유료회원|초1학년|
| 1000| 여|유료회원|초5학년|
|10000| 여|유료회원|초6학년|
+-----+---+--------+-------+
only showing top 3 rows

+-----+-----+-------+--------+----------+--------------+-----+
|  idx|   lv|proc_ym|proc_ymd|  codename|   mascodename|price|
+-----+-----+-------+--------+----------+--------------+-----+
|53687|175.0| 202305|20230508|  액세서리|아바타파츠구분|   50|
|53687|175.0| 202305|20230508|상태메시지|  기타파츠구분|   50|
|20163|161.0| 202304|20230408|  액세서리|아바타파츠구분|   50|
+----

In [8]:
point_his.printSchema()
member_dup.printSchema()
member.printSchema()
item_his.printSchema()
study_his.printSchema()
regdate.printSchema()

root
 |-- idx: string (nullable = true)
 |-- proc_ym: string (nullable = true)
 |-- proc_ymd: string (nullable = true)
 |-- point: string (nullable = true)

root
 |-- idx: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade: string (nullable = true)

root
 |-- idx: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade: string (nullable = true)

root
 |-- idx: string (nullable = true)
 |-- lv: string (nullable = true)
 |-- proc_ym: string (nullable = true)
 |-- proc_ymd: string (nullable = true)
 |-- codename: string (nullable = true)
 |-- mascodename: string (nullable = true)
 |-- price: string (nullable = true)

root
 |-- idx: string (nullable = true)
 |-- proc_ym: string (nullable = true)
 |-- proc_ymd: string (nullable = true)
 |-- pointnm: string (nullable = true)

root
 |-- idx: string (nullable = true)
 |-- regdate: string (nullable = true)



### 1.point_his의 proc_ymd 컬럼의 날짜 표현 형식을 yyyy-mm-dd 형식이 되도록 바꿔주세요.

In [10]:
point_ymd = point_his.withColumn('proc_ymd', F.date_format(F.to_date('proc_ymd', 'yyyyMMdd'), 'yyyy-MM-dd'))
point_ymd.show(5)

+-----+-------+----------+-----+
|  idx|proc_ym|  proc_ymd|point|
+-----+-------+----------+-----+
|96465| 202306|2023-06-24| 1000|
|96465| 202306|2023-06-24|  500|
|87940| 202304|2023-04-05| 2000|
|87940| 202304|2023-04-05| 3500|
|87940| 202304|2023-04-05| 4000|
+-----+-------+----------+-----+
only showing top 5 rows

root
 |-- idx: string (nullable = true)
 |-- proc_ym: string (nullable = true)
 |-- proc_ymd: string (nullable = true)
 |-- point: string (nullable = true)



### 2. item_his 테이블에서 레벨이 null인 유저가 가장 많은 날짜를 구해주세요.

In [13]:
item = item_his.filter(F.col('lv').isNull())
item.show(5)

+-----+----+-------+--------+--------+--------------+-----+
|  idx|  lv|proc_ym|proc_ymd|codename|   mascodename|price|
+-----+----+-------+--------+--------+--------------+-----+
|92027|null| 202305|20230505|  코스튬|아바타파츠구분|  300|
|92131|null| 202305|20230507|  코스튬|아바타파츠구분|  300|
|91962|null| 202305|20230506|  코스튬|아바타파츠구분|  300|
|94835|null| 202306|20230605|  코스튬|아바타파츠구분|  300|
|94835|null| 202306|20230605|  코스튬|아바타파츠구분|  300|
+-----+----+-------+--------+--------+--------------+-----+
only showing top 5 rows



In [14]:
item = item.dropDuplicates(subset = ['idx', 'proc_ymd'])
item.show(5)

+-----+----+-------+--------+--------+--------------+-----+
|  idx|  lv|proc_ym|proc_ymd|codename|   mascodename|price|
+-----+----+-------+--------+--------+--------------+-----+
|96008|null| 202306|20230618|  코스튬|아바타파츠구분|  300|
|88591|null| 202304|20230407|  코스튬|아바타파츠구분|  300|
|91873|null| 202305|20230503|  코스튬|아바타파츠구분|  300|
|88072|null| 202304|20230404|  코스튬|아바타파츠구분|  300|
|93732|null| 202305|20230524|  코스튬|아바타파츠구분|  300|
+-----+----+-------+--------+--------+--------------+-----+
only showing top 5 rows



In [15]:
item.groupBy('proc_ymd').count().orderBy('count', ascending=False).show(1)

+--------+-----+
|proc_ymd|count|
+--------+-----+
|20230505|    7|
+--------+-----+
only showing top 1 row



### 3. regdate_his 등록일(reg_date)이 2023.03.15일인 유료회원들이 가장 많이 착용한 아이템(codename) TOP3를 뽑아주세요.

In [16]:
regdate.show(1)
member.show(1)
item_his.show(1)

+---+--------+
|idx| regdate|
+---+--------+
|  1|20221206|
+---+--------+
only showing top 1 row

+---+---+--------+-------+
|idx|sex|  status|  grade|
+---+---+--------+-------+
|100| 남|유료회원|초1학년|
+---+---+--------+-------+
only showing top 1 row

+-----+-----+-------+--------+--------+--------------+-----+
|  idx|   lv|proc_ym|proc_ymd|codename|   mascodename|price|
+-----+-----+-------+--------+--------+--------------+-----+
|53687|175.0| 202305|20230508|액세서리|아바타파츠구분|   50|
+-----+-----+-------+--------+--------+--------------+-----+
only showing top 1 row



In [19]:
join_data = member.join(regdate, member.idx == regdate.idx, 'inner') \
                  .filter((F.col('status') == '유료회원') & (F.col('regdate') == '20230315')) \
                  .select(member["idx"], "status", "regdate")
join_data.show(3)

+-----+--------+--------+
|  idx|  status| regdate|
+-----+--------+--------+
|84662|유료회원|20230315|
|84663|유료회원|20230315|
|84664|유료회원|20230315|
+-----+--------+--------+
only showing top 3 rows



In [20]:
reg_member_items = join_data.join(item_his, join_data["idx"] == item_his["idx"], "inner") \
                            .select(join_data["idx"], "status", "regdate", "codename")
reg_member_items.show(3)

+-----+--------+--------+----------+
|  idx|  status| regdate|  codename|
+-----+--------+--------+----------+
|84662|유료회원|20230315|      하의|
|84662|유료회원|20230315|      헤어|
|84664|유료회원|20230315|상태메시지|
+-----+--------+--------+----------+
only showing top 3 rows



In [21]:
top_items = reg_member_items.groupBy('codename') \
                            .agg(F.count('codename').alias('wear_count')) \
                            .orderBy(F.desc("wear_count")) \
                            .limit(3)

top_items.show()

+----------+----------+
|  codename|wear_count|
+----------+----------+
|상태메시지|        74|
|      헤어|        67|
|      얼굴|        47|
+----------+----------+



### 4. member_dup 테이블에 학년이 여러개인 idx가 있습니다. 해당 idx의 학년 중 가장 높은 학년만 남겨서 idx와 grade가 1:1 대응이 되도록 만들어주세요

In [22]:
member_dup.show(3)

+-----+---+--------+-------+
|  idx|sex|  status|  grade|
+-----+---+--------+-------+
| 6884| 여|유료회원|초3학년|
| 6331| 남|유료회원|초3학년|
|69294| 남|유료회원|초5학년|
+-----+---+--------+-------+
only showing top 3 rows



In [23]:
member_grade = member_dup.withColumn('grade_reg', F.regexp_extract('grade', '\\d+', 0))
member_grade.show(3)

+-----+---+--------+-------+---------+
|  idx|sex|  status|  grade|grade_reg|
+-----+---+--------+-------+---------+
| 6884| 여|유료회원|초3학년|        3|
| 6331| 남|유료회원|초3학년|        3|
|69294| 남|유료회원|초5학년|        5|
+-----+---+--------+-------+---------+
only showing top 3 rows



In [24]:
window_var = W.Window.partitionBy('idx').orderBy(F.col('grade_reg').desc())
member_match = member_grade.withColumn("rank", F.rank().over(window_var))
member_match \
    .filter(F.col('rank') == 1) \
    .select('idx', 'grade', 'rank').show(10)

+------+-------+----+
|   idx|  grade|rank|
+------+-------+----+
|100010|초4학년|   1|
|100140|초4학년|   1|
| 10096|초5학년|   1|
| 10096|초5학년|   1|
| 10096|초5학년|   1|
| 10436|초6학년|   1|
| 11078|초4학년|   1|
| 11332|초3학년|   1|
| 11563|초3학년|   1|
|  1159|초5학년|   1|
+------+-------+----+
only showing top 10 rows



#### 랭크함수(row_number, rank, dense_rank)
- row_number()
  - 설명: 데이터가 정렬된 순서로 고유한 번호 부여(중복값 있어도 고유한 번호 부여, 중복 순위X)
- rank()
  - 설명: 중복된 값이 있다면 해당 개수만큼 건너 뛰고 다음 번호를 부여
- dense_rank()
  - 중복된 값이 있어도 해당 개수 만큼 건너 뛰지 않고 다음 번호를 연속으로 부여

### 5. 레벨이 151~160 에 있는 유저들 중 딱 한 명씩만 등록한 날짜들을 구해주세요.
  - 레벨은 idx가 가진 레벨중 가장 높은 레벨로 사용(10) -> 중복처리에 유의


In [26]:
item_his.show(3)
item_his.printSchema()

+-----+-----+-------+--------+----------+--------------+-----+
|  idx|   lv|proc_ym|proc_ymd|  codename|   mascodename|price|
+-----+-----+-------+--------+----------+--------------+-----+
|53687|175.0| 202305|20230508|  액세서리|아바타파츠구분|   50|
|53687|175.0| 202305|20230508|상태메시지|  기타파츠구분|   50|
|20163|161.0| 202304|20230408|  액세서리|아바타파츠구분|   50|
+-----+-----+-------+--------+----------+--------------+-----+
only showing top 3 rows

root
 |-- idx: string (nullable = true)
 |-- lv: string (nullable = true)
 |-- proc_ym: string (nullable = true)
 |-- proc_ymd: string (nullable = true)
 |-- codename: string (nullable = true)
 |-- mascodename: string (nullable = true)
 |-- price: string (nullable = true)



In [28]:
item_df = item_his.withColumn('lv', F.col('lv').cast(T.IntegerType())) \
                  .filter((F.col('lv') >= 151) & (F.col('lv') <= 160))
item_df.orderBy(F.col("lv").desc()).show(10)

+-----+---+-------+--------+----------+--------------+-----+
|  idx| lv|proc_ym|proc_ymd|  codename|   mascodename|price|
+-----+---+-------+--------+----------+--------------+-----+
|12109|160| 202304|20230420|    코스튬|아바타파츠구분|  300|
|12109|160| 202304|20230420|      하의|아바타파츠구분|  250|
|12109|160| 202304|20230420|    코스튬|아바타파츠구분|  300|
|30445|160| 202305|20230530|상태메시지|  기타파츠구분|  100|
|12109|160| 202304|20230420|      상의|아바타파츠구분|  200|
|30445|160| 202305|20230530|      신발|아바타파츠구분|  150|
|12109|160| 202304|20230420|      헤어|아바타파츠구분|  150|
|30445|160| 202305|20230530|상태메시지|  기타파츠구분|  100|
|12109|160| 202304|20230420|      하의|아바타파츠구분|  250|
|41510|160| 202304|20230424|      헤어|아바타파츠구분|  150|
+-----+---+-------+--------+----------+--------------+-----+
only showing top 10 rows



In [30]:
window_var = W.Window.partitionBy('idx').orderBy(F.col('lv').desc())
item_df = item_df.withColumn("max_lv", F.row_number().over(window_var)) \
                 .filter(F.col('max_lv') == 1) \
                 .drop('max_lv')
item_df.show(3)

+-----+---+-------+--------+--------+--------------+-----+
|  idx| lv|proc_ym|proc_ymd|codename|   mascodename|price|
+-----+---+-------+--------+--------+--------------+-----+
|12529|155| 202304|20230407|    헤어|아바타파츠구분|  150|
| 1512|155| 202304|20230404|    얼굴|아바타파츠구분|  150|
|18947|156| 202305|20230504|    헤어|아바타파츠구분|  150|
+-----+---+-------+--------+--------+--------------+-----+
only showing top 3 rows



In [35]:
item_reg = item_df.join(regdate, "idx", "inner") \
                  .select("idx", "lv", "regdate")
item_reg.show(3)

+---+---+--------+
|idx| lv| regdate|
+---+---+--------+
| 38|154|20221206|
| 55|155|20221206|
| 70|159|20221206|
+---+---+--------+
only showing top 3 rows



In [36]:
res = item_reg.groupBy('regdate').count() \
              .filter(F.col('count') == 1) \
              .select('regdate')
res.show(10)

+--------+
| regdate|
+--------+
|20230327|
|20230321|
|20230505|
|20230129|
|20230205|
|20230428|
|20230325|
|20230330|
|20230420|
|20230320|
+--------+
only showing top 10 rows



### 1.Spark ML - Linear Regression

In [37]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder \
                    .master("local") \
                    .appName("Colab") \
                    .getOrCreate()

In [38]:
spark

In [39]:
df = spark.read.csv('./sample_data/california_housing_train.csv', header=True, inferSchema=True)

In [40]:
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [42]:
print('Shape of the dataset: ', (df.count(), len(df.columns)))

df.show(n=10)

Shape of the dataset:  (17000, 9)
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|          

In [43]:
from pyspark.ml.feature import VectorAssembler


featureassembler = VectorAssembler(inputCols=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'], outputCol='features')

In [44]:
output = featureassembler.transform(df)
output.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|            features|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|[-114.31,34.19,15...|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|[-114.47,34.4,19....|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|[-114.56,33.69,17...|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|[-114.57,33.64,14...|

In [45]:
final_data = output.select('features', 'median_house_value')
final_data.show()

+--------------------+------------------+
|            features|median_house_value|
+--------------------+------------------+
|[-114.31,34.19,15...|           66900.0|
|[-114.47,34.4,19....|           80100.0|
|[-114.56,33.69,17...|           85700.0|
|[-114.57,33.64,14...|           73400.0|
|[-114.57,33.57,20...|           65500.0|
|[-114.58,33.63,29...|           74000.0|
|[-114.58,33.61,25...|           82400.0|
|[-114.59,34.83,41...|           48500.0|
|[-114.59,33.61,34...|           58400.0|
|[-114.6,34.83,46....|           48100.0|
|[-114.6,33.62,16....|           86500.0|
|[-114.6,33.6,21.0...|           62000.0|
|[-114.61,34.84,48...|           48600.0|
|[-114.61,34.83,31...|           70400.0|
|[-114.63,32.76,15...|           45000.0|
|[-114.65,34.89,17...|           69100.0|
|[-114.65,33.6,28....|           94900.0|
|[-114.65,32.79,21...|           25000.0|
|[-114.66,32.74,17...|           44000.0|
|[-114.67,33.92,17...|           27500.0|
+--------------------+------------

In [46]:
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

In [47]:
(train_data.count(), len(train_data.columns)), (test_data.count(), len(test_data.columns))

((12804, 2), (4196, 2))

In [48]:
from pyspark.ml.regression import LinearRegression


result = LinearRegression(featuresCol='features', labelCol='median_house_value')

In [50]:
result_fit = result.fit(train_data)
result_fit.coefficients

DenseVector([-42701.9912, -42496.281, 1169.8313, -8.3056, 119.616, -37.5491, 41.596, 40607.1182])

In [51]:
pred_train = result_fit.evaluate(train_data)
pred_train.predictions.show()

+--------------------+------------------+------------------+
|            features|median_house_value|        prediction|
+--------------------+------------------+------------------+
|[-124.35,40.54,52...|           94600.0|186896.93006823957|
|[-124.3,41.8,19.0...|           85800.0| 63828.38811664749|
|[-124.27,40.69,36...|           79000.0|154636.36630449584|
|[-124.26,40.58,52...|          111400.0|162922.78442727355|
|[-124.25,40.28,32...|           76100.0| 154696.9893092662|
|[-124.23,40.81,52...|           50800.0| 184516.7200880493|
|[-124.21,40.75,32...|           58100.0|108903.98030509287|
|[-124.21,41.75,20...|           66900.0|  67073.8611378409|
|[-124.21,41.77,17...|           68400.0| 79718.54286785191|
|[-124.19,40.77,30...|           69000.0|143611.76057469426|
|[-124.19,40.78,37...|           70000.0|115794.36983087054|
|[-124.19,41.78,15...|           74600.0| 51738.73723116238|
|[-124.18,40.62,35...|          107000.0|159002.25293956604|
|[-124.18,40.78,33...|  

In [53]:
pred = result_fit.evaluate(test_data)
pred.predictions.show()

+--------------------+------------------+------------------+
|            features|median_house_value|        prediction|
+--------------------+------------------+------------------+
|[-124.3,41.84,17....|          103600.0|101050.39542136434|
|[-124.23,40.54,52...|          106700.0|189362.20494504552|
|[-124.23,41.75,11...|           73200.0| 75903.94192009047|
|[-124.22,41.73,28...|           78300.0| 76114.25266013574|
|[-124.19,40.73,21...|           90100.0| 165469.7540392154|
|[-124.18,40.78,34...|           67000.0|119800.08382594073|
|[-124.17,40.75,13...|          116100.0|199011.75383349974|
|[-124.17,40.8,52....|           62500.0|132272.22601593612|
|[-124.17,40.8,52....|           75500.0|126960.38241179287|
|[-124.16,40.6,39....|           85100.0|147348.16246677423|
|[-124.16,40.78,46...|           81800.0|201518.51471819356|
|[-124.16,40.78,50...|           85400.0|157784.40543071274|
|[-124.16,41.74,15...|           69500.0| 59024.76003741752|
|[-124.15,40.78,36...|  

In [54]:
# R2 Score
print("R2-Score for train set: ", pred_train.r2)
print("R2-Score for test set: ", pred.r2)

R2-Score for train set:  0.6412827357771509
R2-Score for test set:  0.6413328376083546
