# Sklearn Implementation

In [56]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression
import time

In [57]:
data=pd.read_csv('train.csv')

In [58]:
data

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [59]:
X=data.drop('blue',axis=1)



In [60]:
y=data['blue']

In [61]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

In [62]:
model=LogisticRegression()


start_time = time.time()
model.fit(X_train,y_train)

end_time = time.time()
training_time = end_time - start_time

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
pred=model.predict(X_test)

In [64]:
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.59      0.54       195
           1       0.52      0.41      0.46       205

    accuracy                           0.50       400
   macro avg       0.50      0.50      0.50       400
weighted avg       0.50      0.50      0.50       400



In [68]:
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
error_rate = 1 - accuracy

In [69]:
print(f"Model Precision: {precision:.2f}")
print(f"Model Recall: {recall:.2f}")
print(f"Model F1 Score: {f1:.2f}")
print(f"Model Error Rate: {error_rate:.2f}")
print(f"Training Time: {training_time:.2f} seconds")

Model Precision: 0.50
Model Recall: 0.50
Model F1 Score: 0.50
Model Error Rate: 0.18
Training Time: 0.03 seconds


In [72]:
from sklearn.metrics import log_loss
y_prob = model.predict(X_test)
log_loss_value = log_loss(y_test, y_prob)
print(f"Log Loss: {log_loss_value:.2f}")


Log Loss: 17.27


# Pyspark Implementation

In [67]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [34]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import time

In [35]:
spark = SparkSession.builder \
    .appName("Mobile Price Classification") \
    .getOrCreate()

In [36]:
data_path = "train.csv"  
df = spark.read.csv(data_path, header=True, inferSchema=True)

In [37]:
df.printSchema()
df.show(5)


root
 |-- battery_power: integer (nullable = true)
 |-- blue: integer (nullable = true)
 |-- clock_speed: double (nullable = true)
 |-- dual_sim: integer (nullable = true)
 |-- fc: integer (nullable = true)
 |-- four_g: integer (nullable = true)
 |-- int_memory: integer (nullable = true)
 |-- m_dep: double (nullable = true)
 |-- mobile_wt: integer (nullable = true)
 |-- n_cores: integer (nullable = true)
 |-- pc: integer (nullable = true)
 |-- px_height: integer (nullable = true)
 |-- px_width: integer (nullable = true)
 |-- ram: integer (nullable = true)
 |-- sc_h: integer (nullable = true)
 |-- sc_w: integer (nullable = true)
 |-- talk_time: integer (nullable = true)
 |-- three_g: integer (nullable = true)
 |-- touch_screen: integer (nullable = true)
 |-- wifi: integer (nullable = true)
 |-- price_range: integer (nullable = true)

+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------

In [38]:
indexer = StringIndexer(inputCol='price_range', outputCol='label')
df = indexer.fit(df).transform(df)

In [39]:
feature_columns = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep',
    'mobile_wt', 'n_cores', 'pc', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen',
    'wifi'
]

In [40]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df = assembler.transform(df)


In [41]:
df = df.select(col('features'), col('label'))

In [42]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=1234)

In [43]:
lr = LogisticRegression(featuresCol='features', labelCol='label')

In [44]:
start_time = time.time()
lr_model = lr.fit(train_df)
end_time = time.time()
training_time = end_time - start_time

In [45]:
predictions = lr_model.transform(test_df)

In [46]:
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)


In [47]:
evaluator_precision = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='precisionByLabel')
precision = evaluator_precision.evaluate(predictions)


In [48]:

evaluator_recall = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='recallByLabel')
recall = evaluator_recall.evaluate(predictions)

In [49]:
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='fMeasureByLabel')
f1_score = evaluator_f1.evaluate(predictions)

In [50]:
print(f"Model Accuracy: {accuracy:.2f}")
print(f"Model Precision: {precision:.2f}")
print(f"Model Recall: {recall:.2f}")
print(f"Model F1 Score: {f1_score:.2f}")
print(f"Training Time: {training_time:.2f} seconds")

training_summary = lr_model.summary
objective_history = training_summary.objectiveHistory
print(f"Objective History (Loss per Iteration): {objective_history}")

Model Accuracy: 0.82
Model Precision: 0.90
Model Recall: 0.92
Model F1 Score: 0.91
Training Time: 1.34 seconds
Objective History (Loss per Iteration): [1.386175759335101, 1.0283189378476145, 0.8091648234646248, 0.6648317840912846, 0.5554540496030896, 0.4646713253931445, 0.4167437996137804, 0.3897265182388017, 0.3851316078189323, 0.3746874432901736, 0.37387430445291564, 0.3735804658276038, 0.37326334153460294, 0.37306367117734623, 0.37284385114640706, 0.37268616602910537, 0.3725744022502088, 0.37252890436219777, 0.37249504531382294, 0.3724860715298976, 0.3724822025724086, 0.3724790625569326, 0.3724754690495085, 0.3724723553984226, 0.3724690553036002, 0.37246759817985414, 0.37246485233570265, 0.37246352619485845, 0.37245989195998735, 0.3724581419185625, 0.3724572554946225, 0.37245652394516154, 0.37245633709019327, 0.37245629807447395, 0.3724562755569873, 0.37245625455470466, 0.3724562511294615, 0.37245624072983385, 0.37245623823218693, 0.3724562353740705, 0.37245623357006613, 0.372456233