In [21]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report
import joblib


# Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns



In [20]:
# Function to load JSONL file and convert to DataFrame
def jsonl_to_dataframe(file_path, num_lines=500):
    data = []
    with open(file_path, 'r') as f:
        for idx, line in enumerate(f):
            if idx >= num_lines:
                break
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    df = pd.DataFrame(data)
    return df

In [3]:
# Function to extract and flatten band details
def extract_band_details(df):
    category_band_info = []
    for band_detail in df['bandDetails']:
        if band_detail and len(band_detail) > 0:
            bands = []
            for category in band_detail:
                category_name = category.get('categoryName', 'Unknown')
                band_info = category.get('bandInfo', [{'bandName': 'Unknown'}])
                band_names = ', '.join([band['bandName'] for band in band_info])
                bands.append(f"{category_name}: {band_names}")
            category_band_info.append('; '.join(bands))
        else:
            category_band_info.append('Unknown')
    df['bandDetails_flat'] = category_band_info
    return df

# Function to calculate the number of bands
def calculate_num_bands(band_detail):
    num_bands = 0
    if band_detail and len(band_detail) > 0:
        for category in band_detail:
            band_info = category.get('bandInfo', [])
            for band in band_info:
                band_names = band['bandName'].split(',')
                num_bands += len(band_names)
    return num_bands

In [4]:
# Path to the JSONL file
file_path = 'DeviceDatabase_first_500.jsonl'

In [5]:
# Load JSONL file and convert to DataFrame (using only the first 500 lines)
df = jsonl_to_dataframe(file_path, num_lines=500)

In [6]:
# Data Cleaning and Transformation
df.fillna('Unknown', inplace=True)
df['allocationYear'] = pd.to_datetime(df['allocationDate'], format="%d-%b-%Y", errors='coerce').dt.year

In [7]:
# Select relevant columns, excluding 'operatingSystem'
columns_to_keep = ['tac', 'manufacturer', 'modelName', 'allocationDate', 'deviceType', 'bandDetails', 'allocationYear']
df = df[columns_to_keep]

In [8]:
# Calculate the number of bands for each device
df['numBands'] = df['bandDetails'].apply(calculate_num_bands)


In [9]:
# Extract band details into a flat string
df = extract_band_details(df)

TAC

In [10]:
# Break the TAC into columns
df['reportingBodyId'] = df['tac'].astype(str).str[:2]
df['manufacturerModelId'] = df['tac'].astype(str).str[2:]

In [11]:
# Display the updated DataFrame
print(df[['tac', 'reportingBodyId', 'manufacturerModelId']].head())

        tac reportingBodyId manufacturerModelId
0  00102100              00              102100
1  00102200              00              102200
2  00102300              00              102300
3  00102400              00              102400
4  00102500              00              102500


In [12]:
# Prepare the data
features = df[['reportingBodyId', 'manufacturerModelId']].apply(pd.to_numeric)
target = df[['numBands']].apply(pd.to_numeric)

In [13]:
# Split the data (using only 'numBands' as target)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [14]:
# Train a RandomForestRegressor for the 'numBands' target variable
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [15]:
# Evaluate the model
y_pred = regressor.predict(X_test)

In [16]:
# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for 'numBands': {mse}")

# Print overall MSE 
print(f"Overall Mean Squared Error: {mse}")

Mean Squared Error for 'numBands': 9.682359000000002
Overall Mean Squared Error: 9.682359000000002


In [17]:
# Display the updated DataFrame with new features
print(df[['tac', 'manufacturer', 'modelName', 'allocationDate', 'deviceType', 'allocationYear', 'numBands', 'bandDetails_flat']].head())

        tac          manufacturer         modelName allocationDate deviceType  \
0  00102100       Shintom Co. Ltd           MX-5010    21-May-2010   Handheld   
1  00102200                 Intel           Zoarmon    21-May-2010   Handheld   
2  00102300               Samsung          SGH-t829    21-May-2010   Handheld   
3  00102400          Telular Corp  Telguard 5 (TG5)    21-May-2010   Handheld   
4  00102500  Option International            Fizgig    21-May-2010   Handheld   

   allocationYear  numBands                                   bandDetails_flat  
0            2010         4  2G/3G: GSM 900,GSM 1800,GSM 1900; Radio Interf...  
1            2010         5  2G/3G: GSM850 (GSM800),GSM 900,GSM 1800,GSM 19...  
2            2010         5  2G/3G: GSM850 (GSM800),GSM 900,GSM 1800,GSM 19...  
3            2010         3  2G/3G: GSM850 (GSM800),GSM 1900; Radio Interfa...  
4            2010         4  2G/3G: GSM 900,GSM 1800,GSM 1900; Radio Interf...  


In [27]:
# Save the model
joblib.dump(regressor, 'tac_predictor_model.pkl')

OSError: [Errno 28] No space left on device

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Example: Classification model to predict deviceType
# Prepare the data
features = df[['numBands', 'allocationYear']]
target = df['deviceType']

# Encode categorical target
target_encoded = target.astype('category').cat.codes

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        90
           1       0.71      1.00      0.83         5
           2       1.00      0.60      0.75         5

    accuracy                           0.98       100
   macro avg       0.90      0.87      0.86       100
weighted avg       0.99      0.98      0.98       100

