In [13]:
#!pip install pyspark


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline, PipelineModel

In [3]:
spark = SparkSession.builder \
.appName("PipelineOps") \
.master("local[2]") \
.config("spark.executor.memory","4g") \
.config("spark.driver.memory","4g") \
.getOrCreate()

In [4]:
df = spark.read \
.option("header","True") \
.option("sep",",") \
.option("inferSchema","True") \
.format("csv") \
.load("/content/simple_data.csv")

In [5]:
df.show()

+------+--------+---+-----------+--------+-----------+
|sirano|    isim|yas|     meslek|   sehir|aylik_gelir|
+------+--------+---+-----------+--------+-----------+
|     1|   Cemal| 35|       Isci|  Ankara|       3500|
|     2|   Ceyda| 42|      Memur| Kayseri|       4200|
|     3|   Timur| 30|   Müzisyen|Istanbul|       9000|
|     4|   Burcu| 29|Pazarlamaci|  Ankara|       4200|
|     5| Yasemin| 23|Pazarlamaci|   Bursa|       4800|
|     6|     Ali| 33|      Memur|  Ankara|       4250|
|     7|   Dilek| 29|Pazarlamaci|Istanbul|       7300|
|     8|   Murat| 31|   Müzisyen|Istanbul|      12000|
|     9|   Ahmet| 33|     Doktor|  Ankara|      18000|
|    10|Muhittin| 46|     Berber|Istanbul|      12000|
|    11|Hicaziye| 47| Tuhafiyeci|  Ankara|       4800|
|    12|   Harun| 43|    Tornacı|  Ankara|       4200|
|    13|   Hakkı| 33|      Memur|   Çorum|       3750|
|    14| Gülizar| 37|     Doktor|   İzmir|      14250|
|    15|  Şehmuz| 41|   Müzisyen|  Ankara|       8700|
+------+--

In [6]:
df1 = df.withColumn("ekonomik_durum",
    F.when(F.col("aylik_gelir") > 7000, "iyi").otherwise("kötü")
)

In [10]:
train_df, test_df = df1.randomSplit([0.8, 0.2], seed=142)

## 🔍Pipeline

In [8]:
meslek_indexer = StringIndexer() \
.setInputCol("meslek") \
.setOutputCol("meslek_index") \
.setHandleInvalid("skip")


sehir_indexer = StringIndexer() \
.setInputCol("sehir") \
.setOutputCol("sehir_index") \
.setHandleInvalid("skip")


encoder = OneHotEncoder() \
.setInputCols(["meslek_index","sehir_index"]) \
.setOutputCols(["meslek_encoded","sehir_encoded"])


assembler = VectorAssembler() \
.setInputCols(["yas","aylik_gelir","meslek_encoded","sehir_encoded"]) \
.setOutputCol("vectorized_features")

label_indexer = StringIndexer() \
.setInputCol("ekonomik_durum") \
.setOutputCol("label")


scaler = StandardScaler() \
.setInputCol("vectorized_features") \
.setOutputCol("features")


lr_object = LogisticRegression() \
.setFeaturesCol("features") \
.setLabelCol("label") \
.setPredictionCol("prediction")


pipeline_nesnesi = Pipeline() \
.setStages([
            meslek_indexer,\
            sehir_indexer,\
            encoder,\
            assembler,\
            label_indexer,\
            scaler,\
            lr_object])

In [11]:
pipeline_modeli = pipeline_nesnesi.fit(train_df)

In [12]:
pipeline_modeli.transform(test_df).select("label","prediction").toPandas().head()

Unnamed: 0,label,prediction
0,0.0,0.0
1,1.0,1.0
2,1.0,0.0
