### Data loading

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [44]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
import re
from spellchecker import SpellChecker
import pymystem3

[nltk_data] Downloading package punkt to /Users/alina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('/Users/alina/Downloads/bug_data_15000.csv')

### Simple data cleaning

In [46]:
# making a copy of df to make changes and preprocess
dff = df.copy()

In [47]:
# flattening of target categories
dff['Product_component'] = dff['Product'] + ' ' + dff['Component']
dff = dff.drop(columns=['Product', 'Component'])

In [48]:
# drop rows where data in target and important columns is missing
dff = dff.dropna(subset=['Product_component', 'Description'])

# replace NaN values based on the mode (most frequent category of the column)
mode_value = dff['Importance'].mode()[0]
dff['Importance'].fillna(mode_value, inplace=True)

# replace NaN values with a specific text
dff['Title'].fillna('Unknown', inplace=True)

## data cleaning and preparation

### solving imbalance

In [None]:
# function for dropping categories which contain less than 0.005% of the total amount of error reports
# Calculate the percentage of each category
category_percentages = dff["Product_component"].value_counts() / dff["Product_component"].count()

# Create a list of categories to be dropped
categories_to_drop = category_percentages[category_percentages < 0.005].index.tolist()

# Select the rows where the "Product" column is in the list of categories to be dropped
rows_to_drop = dff[dff["Product_component"].isin(categories_to_drop)].index

# Drop the selected rows
dff.drop(rows_to_drop, inplace=True)

In [None]:
# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

In [None]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

In [None]:
# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

In [None]:
# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

In [None]:
# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.75      0.14      0.24        21
     CDT cdt-debug       0.88      0.47      0.61        15
     CDT cdt-other       0.50      0.22      0.31        18
          JDT Core       0.64      0.63      0.64       227
         JDT Debug       0.68      0.69      0.68       254
            JDT UI       0.67      0.79      0.73       631
            PDE UI       0.81      0.54      0.65        93
      Platform Ant       0.56      0.56      0.56        18
  Platform Compare       0.67      0.39      0.49        41
    Platform Debug       0.43      0.37      0.40        75
Platform Resources       0.46      0.39      0.42       106
      Platform SWT       0.72      0.65      0.68       251
     Platform Team       0.81      0.86      0.83       306
       Platform UI       0.66      0.67      0.67       510
   Platform Update       0.69      0.74      0.71        57
     Platform U

### regex cleaning

In [None]:
# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [None]:
def preprocess_text(text):
    # lowercase the text
    text = text.lower()
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    # remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # remove hashtags (keep the words without the '#')
    text = re.sub(r'#(\w+)', r'\1', text)
    # remove special characters and punctuation
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    # remove timestamps
    text = re.sub(r'\(\d{1,2}/\d{1,2}/\d{2} \d{1,2}:\d{2}:\d{2} (AM|PM)\)', ' ', text)
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove code snippets
    # matches code snippets enclosed in triple backticks (```)
    text = re.sub(r'```.+?```', ' ', text, flags=re.DOTALL)

    return text

In [None]:
# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

In [None]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

In [None]:
# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

In [None]:
# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

In [None]:
# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.57      0.18      0.28        22
     CDT cdt-debug       0.50      0.05      0.08        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.14      0.07      0.10        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.69      0.62      0.65       226
         JDT Debug       0.62      0.65      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.00      0.00      0.00        18
            JDT UI       0.65      0.76      0.70       629
         PDE Build       0.33      0.25      0.29         4
            PDE UI       0.60      0.53      0.56        89
      Platform Ant       0.73      0.69      0.71        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.59      0.41      0.49        41
    Platform De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### spellcheck

In [None]:
# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [None]:
# initialize the spell checker object
spell = SpellChecker()

def preprocess_text(text):
    try:
        # Tokenize the text
        tokens = nltk.word_tokenize(text)
        
        # Correct misspelled words
        corrected_tokens = [spell.correction(token) for token in tokens]
        
        # Join the corrected tokens into a string
        processed_text = " ".join(corrected_tokens)
        
        return processed_text
    except TypeError:
        return ''

In [None]:
# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

In [None]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

In [None]:
# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

In [None]:
# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

In [None]:
# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

In [None]:
# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)


### stemming: Porter Stemmer

In [12]:
# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [11]:
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove the stopwords
    tokens = [token for token in tokens]
    # Stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

In [14]:
# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

In [15]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

  y = column_or_1d(y, warn=True)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.50      0.18      0.27        22
     CDT cdt-debug       1.00      0.05      0.09        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.60      0.21      0.32        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.70      0.67      0.68       226
         JDT Debug       0.63      0.68      0.65       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.00      0.00      0.00        18
            JDT UI       0.67      0.77      0.72       629
         PDE Build       0.50      0.25      0.33         4
            PDE UI       0.61      0.56      0.58        89
      Platform Ant       0.87      0.81      0.84        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.67      0.54      0.59        41
    Platform De

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### stemming: SnowballStemmer

In [19]:
from nltk.stem.snowball import SnowballStemmer

# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [20]:
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove the stopwords
    tokens = [token for token in tokens]
    # Stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

In [21]:
# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

In [22]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

  y = column_or_1d(y, warn=True)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.50      0.23      0.31        22
     CDT cdt-debug       0.50      0.09      0.15        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.71      0.36      0.48        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.70      0.67      0.68       226
         JDT Debug       0.62      0.65      0.63       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.25      0.06      0.09        18
            JDT UI       0.68      0.73      0.71       629
         PDE Build       0.50      0.25      0.33         4
            PDE UI       0.63      0.63      0.63        89
      Platform Ant       0.87      0.81      0.84        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.65      0.54      0.59        41
    Platform De

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### lemmatization: WordnetLemmatizer

In [30]:
X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [31]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Download the punkt tokenizer, and WordNet database
nltk.download('punkt')
nltk.download('wordnet')

# Define the preprocessing steps
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    tokens = [token for token in tokens]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /Users/alina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

  y = column_or_1d(y, warn=True)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.50      0.18      0.27        22
     CDT cdt-debug       0.50      0.05      0.08        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.62      0.36      0.45        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.69      0.66      0.68       226
         JDT Debug       0.61      0.67      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.17      0.06      0.08        18
            JDT UI       0.69      0.76      0.72       629
         PDE Build       0.33      0.25      0.29         4
            PDE UI       0.58      0.61      0.59        89
      Platform Ant       0.85      0.69      0.76        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.63      0.54      0.58        41
    Platform De

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### lemmatization: spacy

In [36]:
X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [41]:
import spacy

# Load the English language model
nlp = spacy.load('en_core_web_sm')

# Define the preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenize the text with spaCy
    doc = nlp(text)

    # Lemmatize the tokens
    tokens = [token.lemma_ for token in doc]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)



In [42]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)

# Fit and transform the training set
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test set
X_test_processed = preprocessor.transform(X_test)

# Encode the target feature
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

# Fit the pipeline on training data
pipeline.fit(X_train_processed, y_train_encoded)

# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

  y = column_or_1d(y, warn=True)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.62      0.23      0.33        22
     CDT cdt-debug       1.00      0.14      0.24        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.60      0.21      0.32        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.65      0.64      0.65       226
         JDT Debug       0.68      0.61      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.10      0.06      0.07        18
            JDT UI       0.68      0.76      0.72       629
         PDE Build       0.33      0.25      0.29         4
            PDE UI       0.62      0.58      0.60        89
      Platform Ant       0.85      0.69      0.76        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.64      0.44      0.52        41
    Platform De

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### lemmatization: MyStem

In [45]:
import pymystem3

In [49]:
X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [50]:
# Define the preprocessing steps
mystem = pymystem3.Mystem()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    tokens = [token for token in tokens]

    # Stem the tokens
    tokens = [mystem.lemmatize(token)[0] for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

Installing mystem to /Users/alina/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz


In [51]:
# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.62      0.23      0.33        22
     CDT cdt-debug       1.00      0.14      0.24        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.60      0.21      0.32        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.65      0.64      0.65       226
         JDT Debug       0.68      0.61      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.10      0.06      0.07        18
            JDT UI       0.68      0.76      0.72       629
         PDE Build       0.33      0.25      0.29         4
            PDE UI       0.62      0.58      0.60        89
      Platform Ant       0.85      0.69      0.76        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.64      0.44      0.52        41
    Platform De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## mixed preparation techniques

### solved imbalance + mystem lemmatization

In [55]:
# function for dropping categories which contain less than 0.005% of the total amount of error reports
# Calculate the percentage of each category
category_percentages = dff["Product_component"].value_counts() / dff["Product_component"].count()

# Create a list of categories to be dropped
categories_to_drop = category_percentages[category_percentages < 0.005].index.tolist()

# Select the rows where the "Product" column is in the list of categories to be dropped
rows_to_drop = dff[dff["Product_component"].isin(categories_to_drop)].index

# Drop the selected rows
dff.drop(rows_to_drop, inplace=True)

# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

In [56]:
# Define the preprocessing steps
mystem = pymystem3.Mystem()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    tokens = [token for token in tokens]

    # Stem the tokens
    tokens = [mystem.lemmatize(token)[0] for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

# Apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

# Evaluate on test data
y_pred = pipeline.predict(X_test_processed)

# Decode the numerical labels back to their original form
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# Convert the arrays to pandas DataFrames
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# Replace NaN values with a default value
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

report = classification_report(y_test_decoded, y_pred_decoded)
print('Classification Report:\n', report)

Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.62      0.23      0.33        22
     CDT cdt-debug       1.00      0.14      0.24        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.60      0.21      0.32        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.65      0.64      0.65       226
         JDT Debug       0.68      0.61      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.10      0.06      0.07        18
            JDT UI       0.68      0.76      0.72       629
         PDE Build       0.33      0.25      0.29         4
            PDE UI       0.62      0.58      0.60        89
      Platform Ant       0.85      0.69      0.76        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.64      0.44      0.52        41
    Platform De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## data preparation + TF-IDF vectorization

In [None]:
# split the dataset into training and testing sets

X = dff[['Title', 'Description', 'Importance']]
y = dff['Product_component']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [None]:
# apply the preprocessing function to the text data
X_train['Title'] = X_train['Title'].apply(preprocess_text)
X_train['Description'] = X_train['Description'].apply(preprocess_text)
X_test['Title'] = X_test['Title'].apply(preprocess_text)
X_test['Description'] = X_test['Description'].apply(preprocess_text)

In [None]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('text_title', CountVectorizer(), 'Title'),
        ('text_desc', CountVectorizer(), 'Description'),
        ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])
    ],
    remainder='passthrough'
)