In [1]:
#!pip install pyspark

In [2]:

from pyspark.sql import SparkSession
spark = SparkSession.builder \
.master("local[4]") \
.appName("DataExplore") \
.config("spark.executor.memory","4g") \
.config("spark.driver.memory","4g") \
.getOrCreate()

In [3]:
adult_train_df = spark.read \
.option("header","True") \
.option("inferSchema","True") \
.option("sep",",") \
.csv("/content/adult.data") \

In [4]:
adult_test_df = spark.read \
.option("header","True") \
.option("inferSchema","True") \
.option("sep",",") \
.csv("/content/adult.test") \

In [5]:
adult_train_df.show(5)

+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|  fnlwgt| education|education_num|     marital_status|        occupation|  relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|output|
+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516.0| Bachelors|         13.0|      Never-married|      Adm-clerical| Not-in-family| White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0| Bachelors|         13.0| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
| 38|          Private|215646.0|   HS-grad|       

In [6]:
adult_train_df.limit(5).toPandas().head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,output
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [7]:
adult_test_df.limit(5).toPandas().head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,output
0,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
1,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
2,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
3,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
4,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [8]:
adult_whole_df = adult_train_df.union(adult_test_df)
adult_whole_df.limit(5).toPandas().head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,output
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [9]:
print("train satır sayısı: ", adult_train_df.count())
print("test satır sayısı:  ", adult_test_df.count())
print("whole satır sayısı: ", adult_whole_df.count())

train satır sayısı:  32561
test satır sayısı:   16281
whole satır sayısı:  48842


In [10]:
adult_whole_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: double (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- output: string (nullable = true)



### ***🔍Numeric Features***

In [11]:
adult_whole_df.describe(["age","fnlwgt","education_num","capital_gain","capital_loss","hours_per_week"]).toPandas().head()

Unnamed: 0,summary,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
1,mean,38.64358543876172,189664.13459727284,10.078088530363212,1079.0676262233324,87.50231358257237,40.422382375824085
2,stddev,13.710509934443564,105604.02542315732,2.570972755592263,7452.019057655401,403.0045521243599,12.3914440242523
3,min,17.0,12285.0,1.0,0.0,0.0,1.0
4,max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [12]:
from pyspark.sql import functions as F
adult_whole_df.groupBy(F.col("workclass")) \
.agg({"*":"count"}) \
.toPandas().head(10)

Unnamed: 0,workclass,count(1)
0,State-gov,1981
1,Federal-gov,1432
2,Self-emp-not-inc,3862
3,Local-gov,3136
4,Private,33906
5,?,2799
6,Self-emp-inc,1695
7,Without-pay,21
8,Never-worked,10


### ***categorical Features***

In [13]:
from pyspark.sql import functions as F

adult_whole_df.groupBy(F.col("workclass")) \
.agg({"*":"count"}) \
.toPandas().head(10)

Unnamed: 0,workclass,count(1)
0,State-gov,1981
1,Federal-gov,1432
2,Self-emp-not-inc,3862
3,Local-gov,3136
4,Private,33906
5,?,2799
6,Self-emp-inc,1695
7,Without-pay,21
8,Never-worked,10


In [14]:
adult_whole_df.groupBy(F.col("education")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,education,count(1)
0,Prof-school,834
1,10th,1389
2,7th-8th,955
3,5th-6th,509
4,Assoc-acdm,1601
5,Assoc-voc,2061
6,Masters,2657
7,12th,657
8,Preschool,83
9,9th,756


In [15]:
adult_whole_df.groupBy(F.col("marital_status")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,marital_status,count(1)
0,Widowed,1518
1,Married-spouse-absent,628
2,Married-AF-spouse,37
3,Married-civ-spouse,22379
4,Divorced,6633
5,Never-married,16117
6,Separated,1530


In [16]:
adult_whole_df.groupBy(F.col("occupation")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,occupation,count(1)
0,Farming-fishing,1490
1,Handlers-cleaners,2072
2,Prof-specialty,6172
3,Adm-clerical,5611
4,Exec-managerial,6086
5,Craft-repair,6112
6,Sales,5504
7,?,2809
8,Tech-support,1446
9,Transport-moving,2355


In [17]:
adult_whole_df.groupBy(F.col("relationship")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,relationship,count(1)
0,Husband,19716
1,Own-child,7581
2,Not-in-family,12583
3,Other-relative,1506
4,Wife,2331
5,Unmarried,5125


In [18]:
adult_whole_df.groupBy(F.col("race")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,race,count(1)
0,Asian-Pac-Islander,1519
1,Black,4685
2,Other,406
3,White,41762
4,Amer-Indian-Eskimo,470


In [19]:
adult_whole_df.groupBy(F.col("sex")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,sex,count(1)
0,Male,32650
1,Female,16192


In [20]:
adult_whole_df.groupBy(F.col("native_country")) \
.agg({"*":"count"}) \
.toPandas().head(50)

Unnamed: 0,native_country,count(1)
0,Dominican-Republic,103
1,Ireland,37
2,Cuba,138
3,Guatemala,88
4,Iran,59
5,Taiwan,65
6,El-Salvador,155
7,United-States,43832
8,South,115
9,Japan,92


In [21]:
adult_whole_df.groupBy(F.col("output")) \
.agg({"*":"count"}) \
.toPandas().head(20)

Unnamed: 0,output,count(1)
0,>50K,7841
1,<=50K,24720
2,>50K.,3846
3,<=50K.,12435


In [22]:
from pyspark.sql.functions import *

In [23]:
categorical_columns = ["workclass", "education", "marital_status", "occupation", "relationship", "race",
          "sex", "native_country", "output"]
numerical_columns= ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

In [24]:
adult_whole_df.toPandas().head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,output
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [25]:
for c in adult_whole_df.columns:
  if c in categorical_columns:
    adult_whole_df = adult_whole_df.withColumn(c, trim(col(c)))

adult_whole_df.show(5)


+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|       workclass|  fnlwgt|education|education_num|    marital_status|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|output|
+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| 39|       State-gov| 77516.0|Bachelors|         13.0|     Never-married|     Adm-clerical|Not-in-family|White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
| 50|Self-emp-not-inc| 83311.0|Bachelors|         13.0|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
| 38|         Private|215646.0|  HS-grad|          9.0|          Divorced|Handlers-cl

In [26]:
adult_whole_df1 = adult_whole_df \
.withColumn("workclass", trim(col("workclass"))) \
.withColumn("education", trim(col("education"))) \
.withColumn("marital_status", trim(col("marital_status"))) \
.withColumn("occupation", trim(col("occupation"))) \
.withColumn("relationship", trim(col("relationship"))) \
.withColumn("race", trim(col("race"))) \
.withColumn("sex", trim(col("sex"))) \
.withColumn("native_country", trim(col("native_country"))) \
.withColumn("output", trim(col("output")))

In [27]:
adult_whole_df.count()

48842

In [28]:
adult_whole_df1.count()

48842

In [29]:
adult_whole_df2 = adult_whole_df1 \
.withColumn("output", regexp_replace(col("output"), "<=50K.","<=50K")) \
.withColumn("output", regexp_replace(col("output"), ">50K.",">50K"))

In [30]:
adult_whole_df2.groupBy(col("output")).agg({"*":"count"}) \
.toPandas().head()

Unnamed: 0,output,count(1)
0,<=50K,37155
1,>50K,11687


In [31]:
sayac_for_null = 1
for sutun in adult_whole_df2.columns:
    if(adult_whole_df2.filter(col(sutun).isNull()).count() > 0):
        print(sayac_for_null, ". ", sutun, " içinde null değer var.")
    else:
        print(sayac_for_null, ". ", sutun)
    sayac_for_null+=1

1 .  age
2 .  workclass
3 .  fnlwgt
4 .  education
5 .  education_num
6 .  marital_status
7 .  occupation
8 .  relationship
9 .  race
10 .  sex
11 .  capital_gain
12 .  capital_loss
13 .  hours_per_week
14 .  native_country
15 .  output


In [32]:
sayac_for_question = 1
for sutun in adult_whole_df2.columns:
    if(adult_whole_df2.filter(col(sutun).contains("?")).count() > 0):
        print(sayac_for_question, ". ", sutun, " içinde ? var.")
    else:
        print(sayac_for_question, ". ", sutun)
    sayac_for_question+=1

1 .  age
2 .  workclass  içinde ? var.
3 .  fnlwgt
4 .  education
5 .  education_num
6 .  marital_status
7 .  occupation  içinde ? var.
8 .  relationship
9 .  race
10 .  sex
11 .  capital_gain
12 .  capital_loss
13 .  hours_per_week
14 .  native_country  içinde ? var.
15 .  output


In [33]:
adult_whole_df2.select("workclass", "occupation", "native_country", "output") \
.filter(col("workclass").contains("?") | col("occupation").contains("?") | col("native_country").contains("?")) \
.groupBy("workclass", "occupation", "native_country", "output").count() \
.orderBy(col("count").desc()) \
.toPandas().head(10)

Unnamed: 0,workclass,occupation,native_country,output,count
0,?,?,United-States,<=50K,2284
1,?,?,United-States,>50K,246
2,Private,Other-service,?,<=50K,100
3,Private,Sales,?,<=50K,55
4,Private,Prof-specialty,?,<=50K,51
5,Private,Craft-repair,?,<=50K,48
6,Private,Prof-specialty,?,>50K,48
7,?,?,Mexico,<=50K,48
8,Private,Adm-clerical,?,<=50K,47
9,Private,Machine-op-inspct,?,<=50K,42


In [34]:
adult_whole_df3 = adult_whole_df2 \
        .filter(~(col("workclass").contains("?") | col("occupation").contains("?") | col("native_country").contains("?")))

In [35]:
print(adult_whole_df2.count())
print(adult_whole_df3.count())

48842
45222


In [36]:
adult_whole_df4 = adult_whole_df3 \
.filter(~(col("workclass").contains("never-worked") | col("workclass").contains("without-pay") |
          col("occupation").contains("Armed-Forces")))

In [37]:
print(adult_whole_df3.count())
print(adult_whole_df4.count())

45222
45208


In [39]:
adult_whole_df5 = adult_whole_df4.withColumn("education_merged", when(col("education").isin("1st-4th","education","5th-6th","education","7th-8th"), "Elementary-School")
.when(col("education").isin("9th","10th","11th","12th"), "High-School")
.when(col("education").isin("Masters","Doctorate"), "Post-Graduate")
.when(col("education").isin("Bachelors","Some-college"), "Under-Graduate")
.otherwise(col("education")))


adult_whole_df5.select("education","education_merged").toPandas().head()

Unnamed: 0,education,education_merged
0,Bachelors,Under-Graduate
1,Bachelors,Under-Graduate
2,HS-grad,HS-grad
3,11th,High-School
4,Bachelors,Under-Graduate


In [40]:
adult_whole_df5.groupBy("education_merged").agg({"*":"count"}).toPandas().head(20)

Unnamed: 0,education_merged,count(1)
0,Assoc-acdm,1507
1,Assoc-voc,1959
2,Elementary-School,1494
3,High-School,4094
4,HS-grad,14778
5,Preschool,72
6,Post-Graduate,3056
7,Under-Graduate,17464
8,Prof-school,784


In [41]:
adult_whole_df6 = adult_whole_df5.filter(~(col("native_country") == "Holand-Netherlands"))

In [42]:
print(adult_whole_df5.count())
print(adult_whole_df6.count())

45208
45207


In [43]:
nitelik_siralama = ["workclass", "education", "education_merged", "marital_status", "occupation", "relationship", "race",
     "sex", "native_country", "age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week","output"]
adult_whole_df7 = adult_whole_df6.select(nitelik_siralama)

In [44]:
adult_whole_df7.toPandas().head()

Unnamed: 0,workclass,education,education_merged,marital_status,occupation,relationship,race,sex,native_country,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,output
0,State-gov,Bachelors,Under-Graduate,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,39,77516.0,13.0,2174.0,0.0,40.0,<=50K
1,Self-emp-not-inc,Bachelors,Under-Graduate,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,50,83311.0,13.0,0.0,0.0,13.0,<=50K
2,Private,HS-grad,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,38,215646.0,9.0,0.0,0.0,40.0,<=50K
3,Private,11th,High-School,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,53,234721.0,7.0,0.0,0.0,40.0,<=50K
4,Private,Bachelors,Under-Graduate,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,28,338409.0,13.0,0.0,0.0,40.0,<=50K


In [45]:
 adult_whole_df7 \
      .coalesce(1) \
      .write \
      .mode("overwrite") \
      .option("sep",",") \
      .option("header","true") \
      .csv("/content/adult_preprocessing.csv")