## Data loading and initial inspection

### Task

Load the provided dataset into a pandas DataFrame and perform an initial inspection to understand the data structure, types, and missing values.


In [None]:
import pandas as pd

# Read the commit dataset from disk into a DataFrame
df = pd.read_csv("/content/final_dataset.csv")

# Preview: the leading five records
print("First 5 rows of the DataFrame:")
head_preview = df.head(5)
display(head_preview)

# Overview printed by pandas itself (info() writes to stdout, so no display() needed)
print("\nConcise summary of the DataFrame:")
df.info()

# Column dtypes on their own, rendered as a Series
print("\nData types of each column:")
column_types = df.dtypes
display(column_types)

# Null tally per column (isna is the same check as isnull)
print("\nMissing values per column:")
missing_per_column = df.isna().sum()
display(missing_per_column)

Identify the unique values in `committype`, `timeofcommit`, and `fileextensions`, then one-hot encode `committype` and `timeofcommit`. Next, parse `fileextensions` into a list of extensions per row and encode it with `MultiLabelBinarizer`. Finally, concatenate the encoded features and drop the original columns.



In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

import re

# Identify unique values
print("Unique values in 'committype':", df['committype'].unique())
print("Unique values in 'timeofcommit':", df['timeofcommit'].unique())


def _parse_extensions(raw):
    """Extract the individual extensions from one stringified-list cell.

    Handles both quoted forms, e.g. "['.py', '.js']" and
    "[np.str_('.py'), np.str_('.js')]", by pulling out every quoted token.
    If the cell contains no quotes at all, it falls back to splitting the
    bracket-stripped text on commas.

    Args:
        raw: The raw cell value from the 'fileextensions' column.

    Returns:
        A list of non-empty extension strings; an empty list when the cell
        is not a string (e.g. NaN) or holds no extensions.
    """
    if not isinstance(raw, str):
        return []
    # NOTE: str.strip(chars) removes any of the given *characters*, not a
    # prefix/suffix, so stripping "np.str_('" would also eat the leading
    # '.', 'p', 's', 't', 'r', ... of the extension itself. Match the quoted
    # payload explicitly instead.
    quoted = [match[1].strip() for match in re.findall(r"""(['"])(.*?)\1""", raw)]
    if quoted:
        return [ext for ext in quoted if ext]
    # Fallback for unquoted cells — assumes a plain comma-separated list
    return [ext.strip() for ext in raw.strip("[] ").split(",") if ext.strip()]


# Process 'fileextensions' to extract individual extensions
df['fileextensions_list'] = df['fileextensions'].apply(_parse_extensions)

all_extensions = [ext for sublist in df['fileextensions_list'] for ext in sublist]
print("Unique values in 'fileextensions':", np.unique(all_extensions))

# One-hot encode 'committype' and 'timeofcommit'
df_encoded = pd.get_dummies(df, columns=['committype', 'timeofcommit'], drop_first=False)

# Multi-hot encode the per-row extension lists
mlb = MultiLabelBinarizer()
fileextensions_encoded = mlb.fit_transform(df_encoded['fileextensions_list'])
fileextensions_df = pd.DataFrame(
    fileextensions_encoded,
    columns=[f'fileextension_{cls}' for cls in mlb.classes_],
    # Reuse the source index so the axis=1 concat below lines rows up by
    # label even if df does not carry a default RangeIndex
    index=df_encoded.index,
)

# Concatenate the new features and drop original columns
df_processed = pd.concat(
    [df_encoded.drop(columns=['fileextensions', 'fileextensions_list']), fileextensions_df],
    axis=1,
)

display(df_processed.head())

## Preprocessing - handling text data

### Task
Preprocess the `commitmessage` text data. This will involve steps like cleaning the text, tokenization, removing stop words, and potentially using techniques like TF-IDF or word embeddings to represent the text numerically.


In [None]:
# Download necessary NLTK data. Each resource backs one call used below:
#   punkt_tab -> nltk.word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer.lemmatize
# Without the last two, a fresh kernel raises LookupError.
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared by preprocess_text; built once so they are not recreated per row
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalise one commit message for vectorisation.

    Steps: lowercase, replace every non-letter/non-whitespace character with
    a space, tokenize with NLTK, drop English stop words, lemmatize.

    Args:
        text: The raw message. Any non-string value (e.g. NaN) is treated
            as an empty message.

    Returns:
        The surviving lemmatized tokens joined by single spaces; "" for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Substitute a space rather than deleting: deleting glues together words
    # separated only by punctuation or digits ("fix(parser):bug" would
    # become the single token "fixparserbug").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    kept = (word for word in tokens if word not in stop_words)
    return " ".join(lemmatizer.lemmatize(word) for word in kept)

# Clean every commit message. Held in a standalone Series so no temporary
# column has to be added to (and later dropped from) df_processed.
commitmessage_processed = df_processed['commitmessage'].apply(preprocess_text)

# Unigram + bigram TF-IDF, capped at the 1000 highest-frequency terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

# Fit and transform the processed text data
tfidf_features = tfidf_vectorizer.fit_transform(commitmessage_processed).toarray()

# Convert TF-IDF features to a DataFrame
tfidf_df = pd.DataFrame(
    tfidf_features,
    columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])],
    # Share df_processed's index: pd.concat(axis=1) aligns on labels, so a
    # fresh RangeIndex would misalign rows (and pad with NaN) whenever
    # df_processed's index is not 0..n-1.
    index=df_processed.index,
)

# Swap the raw message column for its TF-IDF representation
df_processed = pd.concat([df_processed.drop(columns=['commitmessage']), tfidf_df], axis=1)

display(df_processed.head())

## Preprocessing - handling numerical features

### Task
Ensure numerical features like `numfileschanged`, `linesadded`, `linesdeleted`, and `numcommentsadded` are in a suitable format for modeling. This might involve scaling or normalization if necessary.


In [None]:
from sklearn.preprocessing import StandardScaler

# Count-style columns that are standardised in this cell
numerical_cols = ['numfileschanged', 'linesadded', 'linesdeleted', 'numcommentsadded']

# Pull those columns out as their own frame
df_numerical = df_processed[numerical_cols]

# Fit the scaler first, then transform in a separate step
scaler = StandardScaler()
scaler.fit(df_numerical)
scaled_numerical_data = scaler.transform(df_numerical)

# Overwrite the raw columns with their scaled counterparts
df_processed[numerical_cols] = scaled_numerical_data

# Preview the result
display(df_processed.head())