In [None]:
import pandas as pd
import torch
import tensorflow as tf
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import zipfile

# Unpack every member of the dataset archive into the "data_folder2" directory.
with zipfile.ZipFile("dataset_small.zip", mode="r") as archive:
    archive.extractall(path="data_folder2")

In [None]:
# Read the extracted CSV into the working DataFrame, then preview its first rows.
dataset_csv = "data_folder2/structured_cleaned_new_dataset.csv"
df = pd.read_csv(dataset_csv)
df.head(5)


Unnamed: 0,id,file_path,file_size,line_count,extension,language,code,clean_code,clean_line_count,clean_size
0,1,Markdown/000001.md,34784,572,md,Markdown,# Contributing\n\n| Component | Bui...,contributing\n\n component build ...,186,10000
1,2,XML/000002.props,3013,44,props,XML,"﻿<Project ToolsVersion=""15.0"" xmlns=""http://sc...",project toolsversion xmlns\n propertygroup\n ...,44,1812
2,3,Text/000003.txt,1076,21,txt,Text,The MIT License (MIT)\n\nCopyright (c) 2015 Mi...,the mit license mit\n\ncopyright c 2015 micros...,21,1026
3,4,Markdown/000004.md,8105,84,md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...,azure sdk for net\n\npackageshttpsimgshieldsi...,84,7244
4,5,Markdown/000005.md,2763,41,md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...,begin microsoft securitymd v005 block \n\n se...,41,2523


In [None]:
# Count the rows whose cleaned code text is absent (NaN).
clean_code_is_missing = df["clean_code"].isna()
missing = clean_code_is_missing.sum()
print(f"Missing code rows: {missing}")

Missing code rows: 58


In [None]:
# Preview of the frame with every row containing any missing value removed.
# The result is only displayed, not assigned, so `df` itself is left unchanged.
df.dropna(axis=0, how="any")

Unnamed: 0,id,file_path,file_size,line_count,extension,language,code,clean_code,clean_line_count,clean_size
0,1,Markdown/000001.md,34784,572,md,Markdown,# Contributing\n\n| Component | Bui...,contributing\n\n component build ...,186,10000
1,2,XML/000002.props,3013,44,props,XML,"﻿<Project ToolsVersion=""15.0"" xmlns=""http://sc...",project toolsversion xmlns\n propertygroup\n ...,44,1812
2,3,Text/000003.txt,1076,21,txt,Text,The MIT License (MIT)\n\nCopyright (c) 2015 Mi...,the mit license mit\n\ncopyright c 2015 micros...,21,1026
3,4,Markdown/000004.md,8105,84,md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...,azure sdk for net\n\npackageshttpsimgshieldsi...,84,7244
4,5,Markdown/000005.md,2763,41,md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...,begin microsoft securitymd v005 block \n\n se...,41,2523
...,...,...,...,...,...,...,...,...,...,...
86222,86229,SQL/086229.sql,333,18,sql,SQL,CREATE SCHEMA TestSchema;\nGO\n\nCREATE TABLE ...,create schema testschema\ngo\n\ncreate table t...,18,253
86223,86230,SQL/086230.sql,420,8,sql,SQL,"WITH a AS (SELECT * FROM (VALUES(1),(2),(3),(4...",with a as select from values12345678910 as aa...,8,304
86224,86231,SQL/086231.sql,333,18,sql,SQL,CREATE SCHEMA TestSchema;\nGO\n\nCREATE TABLE ...,create schema testschema\ngo\n\ncreate table t...,18,253
86225,86232,SQL/086232.sql,420,8,sql,SQL,"WITH a AS (SELECT * FROM (VALUES(1),(2),(3),(4...",with a as select from values12345678910 as aa...,8,304


In [None]:
min_count = 10  # minimum number of samples a language needs to keep its own label

# Frequency of every language, then the frequency of each row's own language,
# looked up in a single vectorized pass instead of a per-row Python lambda.
counts = df["language"].value_counts()
language_freq = df["language"].map(counts)

# Languages seen fewer than `min_count` times are merged into "Other".
# A missing language has no entry in `counts` (value_counts drops NaN), so its
# frequency is NaN, the comparison is False, and it also becomes "Other"
# rather than raising a KeyError.
df["language"] = df["language"].where(language_freq >= min_count, "Other")

print(df["language"].value_counts())

language
Dart          15345
Rust          14128
C#            10000
Go             9124
JSON           5570
              ...  
Puppet           12
RDoc             11
SourcePawn       11
Pascal           11
Gradle           10
Name: count, Length: 61, dtype: int64


In [None]:
# Boolean mask marking every missing cell; reused for both summaries below.
null_mask = df.isnull()

# Per-column tally of missing entries.
print(null_mask.sum())

# Check how many rows in total have any missing value
rows_with_missing = null_mask.any(axis=1).sum()
print("Rows with missing values:", rows_with_missing)

id                   0
file_path            0
file_size            0
line_count           0
extension            0
language             0
code                56
clean_code          58
clean_line_count     0
clean_size           0
dtype: int64
Rows with missing values: 58


# Start implementing your model (split the data, initialize the model, etc.)

# Task
Create a CNN + LSTM model using the provided data.

## Data preprocessing

### Subtask:
Prepare the data for the CNN-LSTM model. This includes handling missing values, encoding categorical features, and splitting the data into training and testing sets.


**Reasoning**:
Drop rows with missing values in the `clean_code` column, convert the `language` column to categorical codes, select features and target, and split the data into training and testing sets.



In [None]:
from sklearn.model_selection import train_test_split

# Rows without cleaned code are dropped, then each language name is encoded as
# an integer category code. Chaining dropna/assign builds a new frame instead of
# mutating the existing one in place.
df = df.dropna(subset=["clean_code"]).assign(
    language_code=lambda frame: frame["language"].astype("category").cat.codes
)

# `language_code` is the prediction target, so it must NOT also be a feature:
# including it in X leaks the label into the inputs and makes any reported
# accuracy meaningless. Only the size/line-count columns are used as features.
X = df[["clean_size", "clean_line_count"]]
y = df["language_code"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (68935, 3)
X_test shape: (17234, 3)
y_train shape: (68935,)
y_test shape: (17234,)


## Model definition

### Subtask:
Define the architecture of the CNN-LSTM model using TensorFlow or PyTorch. This will involve adding CNN layers for spatial feature extraction and LSTM layers for sequential pattern recognition.


**Reasoning**:
Define the architecture of the CNN-LSTM model using TensorFlow.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Input, Reshape

# Conv1D/LSTM expect input shaped (samples, timesteps, features).
# The tabular features are not a real sequence, so each feature column is
# treated as one timestep carrying a single value.
n_features = X_train.shape[1]
input_shape = (n_features, 1)  # (timesteps, features)

model = Sequential()

# Declare the input with an explicit Input object. Passing `input_shape=` to the
# first layer instead makes Keras emit a warning from the layer constructor.
model.add(Input(shape=(n_features,)))

# Reshape (n_features,) -> (n_features, 1) to add the per-timestep feature axis.
model.add(Reshape(input_shape))

# Convolution over neighbouring timesteps; 'same' padding keeps the length.
model.add(Conv1D(filters=32, kernel_size=2, activation='relu', padding='same'))

# Downsample along the timestep axis.
model.add(MaxPooling1D(pool_size=2))

# LSTM without return_sequences returns only its final output, which can feed
# a Dense layer directly.
model.add(LSTM(units=50))

# One softmax output unit per distinct language code.
num_classes = df["language_code"].nunique()
model.add(Dense(units=num_classes, activation='softmax'))

# Print the model summary
model.summary()

  super().__init__(**kwargs)


## Model compilation

### Subtask:
Compile the model by specifying the optimizer, loss function, and metrics.


**Reasoning**:
Compile the defined TensorFlow model with the specified optimizer, loss function, and metrics.



In [None]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# (the labels are integer class codes, not one-hot vectors), accuracy metric.
compile_options = {
    "optimizer": "adam",
    "loss": "sparse_categorical_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

## Model training

### Subtask:
Train the CNN-LSTM model on the training data.


**Reasoning**:
Train the compiled model using the training data with specified epochs, batch size, and validation split, and store the training history.



In [None]:
# Train for 20 epochs in mini-batches of 64, reserving 20% of the training
# data for validation; the per-epoch metrics are kept in `history`.
fit_options = dict(epochs=20, batch_size=64, validation_split=0.2)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.2765 - loss: 2.7681 - val_accuracy: 0.4036 - val_loss: 2.2385
Epoch 2/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.4044 - loss: 2.2150 - val_accuracy: 0.4703 - val_loss: 2.0467
Epoch 3/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.4673 - loss: 2.0300 - val_accuracy: 0.4882 - val_loss: 1.9503
Epoch 4/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.4979 - loss: 1.8605 - val_accuracy: 0.5459 - val_loss: 1.6742
Epoch 5/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.5570 - loss: 1.6654 - val_accuracy: 0.5900 - val_loss: 1.5511
Epoch 6/20
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5943 - loss: 1.5257 - val_accuracy: 0.6205 - val_loss: 1.3875
Epoch 7/20
[1m862/862[0m 

**Reasoning**:
The first step is to load the data from the CSV file into a pandas DataFrame.



**Reasoning**:
The previous attempt to load the data failed because the file 'car_data.csv' was not found. I need to list the files in the current directory to find the correct file name.



In [None]:
import os

# Show the entries of the current working directory.
cwd_entries = os.listdir('.')
print(cwd_entries)

['.config', 'dataset_small.zip', 'data_folder2', 'structured_cleaned_new_dataset.csv', 'sample_data']


**Reasoning**:
The previous command showed that the file 'car_data.csv' does not exist. However, 'dataset_small.zip' is present. It is likely that the data is contained within this zip file. I need to extract the zip file and then load the data from the extracted file.



In [None]:
import zipfile

# Extract the archive's members directly into the current working directory.
with zipfile.ZipFile('dataset_small.zip', mode='r') as archive:
    archive.extractall(path='.')

# Re-list the working directory to confirm what is present after extraction.
print(os.listdir('.'))

['.config', 'dataset_small.zip', 'data_folder2', 'structured_cleaned_new_dataset.csv', 'sample_data']


**Reasoning**:
The previous command successfully extracted the zip file and the file 'structured_cleaned_new_dataset.csv' is now present. I can now load this CSV file into a pandas DataFrame.



In [None]:
# Load the dataframe.
df_car_data = pd.read_csv('structured_cleaned_new_dataset.csv')

# Print some info.
df_car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86227 entries, 0 to 86226
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                86227 non-null  int64 
 1   file_path         86227 non-null  object
 2   file_size         86227 non-null  int64 
 3   line_count        86227 non-null  int64 
 4   extension         86227 non-null  object
 5   language          86227 non-null  object
 6   code              86171 non-null  object
 7   clean_code        86169 non-null  object
 8   clean_line_count  86227 non-null  int64 
 9   clean_size        86227 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 6.6+ MB


## Finish task

Summarize the findings from the model evaluation and present the performance metrics (loss and accuracy) of the CNN-LSTM model.

## Model evaluation

### Subtask:
Evaluate the trained CNN-LSTM model on the testing data to assess its performance.

**Reasoning**:
Evaluate the trained model using the testing data to determine its accuracy and loss on unseen data.

In [None]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8515 - loss: 0.4923
Test Loss: 0.4910
Test Accuracy: 0.8505
