<a href="https://colab.research.google.com/github/CrissRMFI/TA047R-2C2024-GRUPO08/blob/main/TA047R_TP2_GRUPO08_ENTREGA_N3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import xgboost as xgb
from sklearn.model_selection import cross_val_score

In [None]:
# Load the train and test splits from CSV files in the working directory.
train_set, test_set = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

# ANÁLISIS EXPLORATORIO

In [None]:
# Preview the first rows of the training data.
train_set.head()

Unnamed: 0,id,title,description,project,storypoint
0,5660,Error enabling Appcelerator services during ap...,"When creating the default app, I encountered t...",project8,3
1,9014,Create a maintenance branch,"As a developer, I'd like to have a maintenance...",project6,5
2,4094,Service Activity Monitoring Backend integrated...,SAM API used by SAM GUI,project1,5
3,811,fs::enter(rootfs) does not work if 'rootfs' is...,I noticed this when I was testing the unified ...,project5,2
4,4459,transform processor with script option is broken,Creating the following stream throws exception...,project6,2


In [None]:
# Summarize column dtypes and non-null counts of the training data.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7900 entries, 0 to 7899
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           7900 non-null   int64 
 1   title        7900 non-null   object
 2   description  7900 non-null   object
 3   project      7900 non-null   object
 4   storypoint   7900 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 308.7+ KB


# Pre-Procesamiento

# Separamos los datos en variables de entrada y etiquetas

In [None]:
# Inputs: the two free-text columns plus the project label.
X_train_text = train_set.loc[:, ['title', 'description', 'project']]
# Target: the story-point estimate of each issue.
Y_train = train_set.loc[:, 'storypoint']

In [None]:
# Display the selected input columns.
X_train_text

Unnamed: 0,title,description,project
0,Error enabling Appcelerator services during ap...,"When creating the default app, I encountered t...",project8
1,Create a maintenance branch,"As a developer, I'd like to have a maintenance...",project6
2,Service Activity Monitoring Backend integrated...,SAM API used by SAM GUI,project1
3,fs::enter(rootfs) does not work if 'rootfs' is...,I noticed this when I was testing the unified ...,project5
4,transform processor with script option is broken,Creating the following stream throws exception...,project6
...,...,...,...
7895,"As Patrick, I want to be able to create a new ...",# Dialog is shown # fields we need to populat...,project7
7896,GMock warning in ReservationTest.ACLMultipleOp...,{noformat} [ RUN ] ReservationTest.ACLMu...,project5
7897,WSDL Improvement of the SOAP based Service Loc...,As developer I want to use a service that foll...,project1
7898,Platform Config Wizard: Auto-check if Tizen is...,This is an extension of TISTUD-5246. When the ...,project8


In [None]:
# Peek at the first target values.
Y_train.head()

Unnamed: 0,storypoint
0,3
1,5
2,5
3,2
4,2


In [None]:
# Count how many training rows carry each story-point value.
Y_train.value_counts()

Unnamed: 0_level_0,count
storypoint,Unnamed: 1_level_1
3,1848
5,1693
1,1629
2,1276
8,1025
4,166
13,154
10,32
20,31
6,17


## Vectorizamos el texto

## Método TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# One document per issue: title and description joined by a single space.
# The TF-IDF vocabulary is capped at 1000 terms with English stop words removed,
# and the sparse result is converted to a dense array.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_title_desc_tfidf = vectorizer.fit_transform(
    X_train_text['title'].str.cat(X_train_text['description'], sep=' ')
).toarray()

# Codificación del atributo Project

### El objetivo es transformar las variables categóricas en numéricas, ya que el modelo que aplicaremos no puede trabajar directamente con datos categóricos. De esta forma el modelo puede interpretar las categorías.

### Usaremos de la biblioteca **sklearn** la herramienta **LabelEncoder**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn an integer code for every project name seen in the training data,
# then shape the codes as a single-column 2-D array so they can be stacked
# next to the text features.
label_encoder = LabelEncoder().fit(X_train_text['project'])
X_train_project = label_encoder.transform(X_train_text['project'])[:, np.newaxis]

In [None]:
# Inspect the encoded project column.
X_train_project

array([[7],
       [5],
       [0],
       ...,
       [0],
       [7],
       [7]])

A continuación concatenamos todas las características del set de entrenamiento. La idea es aprovechar las columnas **title** y **description**, que fueron transformadas usando **TfidfVectorizer**, y la columna **project**, codificada anteriormente con **LabelEncoder**.

In [None]:
# Append the encoded project column to the right of the TF-IDF matrix.
X_train_combined_tfidf = np.concatenate(
    [X_train_title_desc_tfidf, X_train_project], axis=1
)
X_train_combined_tfidf

array([[0., 0., 0., ..., 0., 0., 7.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 7.],
       [0., 0., 0., ..., 0., 0., 7.]])

# Pre-Procesamiento del set de prueba

In [None]:
# Apply the already-fitted vectorizer and encoder to the test set
# (transform only, so nothing is re-fitted on test data).
X_test_title_desc_tfidf = vectorizer.transform(
    test_set['title'].str.cat(test_set['description'], sep=' ')
).toarray()
X_test_project = label_encoder.transform(test_set['project'])[:, np.newaxis]

# Same column layout as the training matrix: text features first, project code last.
X_test_combined_tfidf = np.concatenate(
    [X_test_title_desc_tfidf, X_test_project], axis=1
)

# Entrenamiento del Modelo y Predicciones

In [None]:
# Gradient-boosted regressor trained on the TF-IDF + project features.
# `fit` returns the estimator itself, so construction and training are chained.
model_tfidf = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.01,
    max_depth=10,
    random_state=84011,
).fit(X_train_combined_tfidf, Y_train)

# Continuous story-point predictions for the test set.
y_test_pred = model_tfidf.predict(X_test_combined_tfidf)
print(y_test_pred)

[4.94674   2.8208344 2.5879304 ... 3.1588712 5.6264205 4.1928926]


In [None]:
# Round each prediction to the nearest integer and cast to int.
y_test_pred_rounded = np.rint(y_test_pred).astype(int)

In [None]:
# Assemble the submission table: one rounded story-point prediction per test id.
results = pd.DataFrame({
    'id': test_set['id'],
    'storypoint': y_test_pred_rounded
})

# Persist the predictions without the DataFrame index.
results.to_csv('results-tfidf.csv', index=False, header=True)

# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, so it never rendered before.
results.head(15)

In [None]:
# Show the first five rows of the current `results` table.
results.head(5)

Unnamed: 0,id,storypoint
0,3433,5
1,106,3
2,7182,3
3,8985,3
4,2149,3


## Método Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counter: raw term counts, vocabulary capped at 1000 terms,
# English stop words removed.
vectorizer_bow = CountVectorizer(
    stop_words='english',
    max_features=1000,
)

In [None]:
# Learn the bag-of-words vocabulary on "title description" documents and
# return the term counts as a dense array.
X_train_title_desc_bow = vectorizer_bow.fit_transform(
    X_train_text['title'].str.cat(X_train_text['description'], sep=' ')
).toarray()

In [None]:
# Append the encoded project column to the right of the bag-of-words counts.
X_train_combined_bow = np.concatenate(
    [X_train_title_desc_bow, X_train_project], axis=1
)
X_train_combined_bow

array([[0, 0, 0, ..., 0, 0, 7],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 7],
       [0, 0, 0, ..., 0, 0, 7]])

### Aplicamos el método de Bag of Words al set de test

In [None]:
# Apply the fitted bag-of-words vectorizer and project encoder to the test set
# (transform only, so nothing is re-fitted on test data).
X_test_title_desc_bow = vectorizer_bow.transform(
    test_set['title'].str.cat(test_set['description'], sep=' ')
).toarray()
X_test_project = label_encoder.transform(test_set['project'])[:, np.newaxis]

# Same column layout as the training matrix: text counts first, project code last.
X_test_combined_bow = np.concatenate(
    [X_test_title_desc_bow, X_test_project], axis=1
)

In [None]:
# Gradient-boosted regressor with the same hyper-parameters as the TF-IDF model,
# trained on the bag-of-words + project features.
# `fit` returns the estimator itself, so construction and training are chained.
model_bow = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.01,
    max_depth=10,
    random_state=84011,
).fit(X_train_combined_bow, Y_train)

# Continuous story-point predictions for the test set.
y_test_pred = model_bow.predict(X_test_combined_bow)
print(y_test_pred)

[4.7831683 3.1497746 2.5413637 ... 3.225056  5.429737  4.1173854]


In [None]:
# Round each prediction to the nearest integer and cast to int.
y_test_pred_rounded = np.rint(y_test_pred).astype(int)

In [None]:
# Inspect the rounded integer predictions.
y_test_pred_rounded

array([[3],
       [3],
       [4],
       ...,
       [3],
       [6],
       [5]])

In [None]:
# Assemble the submission table: one rounded story-point prediction per test id.
results = pd.DataFrame({
    'id': test_set['id'],
    'storypoint': y_test_pred_rounded
})

# Persist the predictions without the DataFrame index.
results.to_csv('results-bag-of-words.csv', index=False, header=True)

# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, so it never rendered before.
results.head(15)

In [None]:
# Show the first five rows of the current `results` table.
results.head(5)

Unnamed: 0,id,storypoint
0,3433,5
1,106,3
2,7182,3
3,8985,3
4,2149,3


## MODELO WORD EMBEDDINGS

### Pre - Procesamiento:



In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Word-index tokenizer limited to 10000 words; words outside the vocabulary
# are mapped to the "<OOV>" token. The vocabulary is learned from the
# training text only.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(X_train_text['title'].str.cat(X_train_text['description'], sep=' '))

# Convert each document to a sequence of word indices.
X_train_seq = tokenizer.texts_to_sequences(
    X_train_text['title'].str.cat(X_train_text['description'], sep=' ')
)
X_test_seq = tokenizer.texts_to_sequences(
    test_set['title'].str.cat(test_set['description'], sep=' ')
)

# Force a fixed length of 100 tokens: shorter sequences are zero-padded at the
# end and longer ones are cut at the end.
X_train_padded = pad_sequences(X_train_seq, maxlen=100, truncating='post', padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=100, truncating='post', padding='post')


In [None]:
# Parse the GloVe vectors file: on each line the first token is the word and
# the remaining tokens are its float coefficients.
# The file is opened explicitly as UTF-8: without `encoding`, open() falls back
# to the platform's locale encoding, which can raise UnicodeDecodeError or
# mis-decode non-ASCII tokens.
embedding_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefficients


# Row i of the matrix holds the vector of the word with tokenizer index i.
# The extra row (+1) is for index 0, which the tokenizer's word_index never assigns.
embedding_dim = 100
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    # Words missing from GloVe (or with an unexpected vector length) keep an all-zero row.
    if embedding_vector is not None and embedding_vector.shape[0] == embedding_dim:
        embedding_matrix[i] = embedding_vector



In [None]:
# Keras building blocks used by this model. They are imported here because no
# earlier cell brings them into scope, so on a fresh kernel this cell failed
# with NameError.
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential

# Feed-forward regressor on top of a frozen embedding layer (ReLU hidden layers, Adam optimizer).
model_relu_adam = Sequential()
# The embedding weights come from `embedding_matrix` and are not updated
# during training (trainable=False).
model_relu_adam.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=100,
                    weights=[embedding_matrix],
                    input_length=100,
                    trainable=False))


# Flatten the per-token embeddings into one vector per sample for the dense layers.
model_relu_adam.add(Flatten())
model_relu_adam.add(Dense(64, activation='relu'))
model_relu_adam.add(Dense(32, activation='relu'))
# Single output unit with no activation: the predicted story points.
model_relu_adam.add(Dense(1))


model_relu_adam.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])


# 20% of the training rows are held out for validation (validation_split=0.2).
model_relu_adam.fit(X_train_padded, Y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10




[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 9.2060 - mse: 9.2060 - val_loss: 7.8849 - val_mse: 7.8849
Epoch 2/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 6.4743 - mse: 6.4743 - val_loss: 8.0304 - val_mse: 8.0304
Epoch 3/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 3.5494 - mse: 3.5494 - val_loss: 8.5321 - val_mse: 8.5321
Epoch 4/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 1.9604 - mse: 1.9604 - val_loss: 9.0051 - val_mse: 9.0051
Epoch 5/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 1.2072 - mse: 1.2072 - val_loss: 9.4065 - val_mse: 9.4065
Epoch 6/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 1.0196 - mse: 1.0196 - val_loss: 9.3657 - val_mse: 9.3657
Epoch 7/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss

<keras.src.callbacks.history.History at 0x7893f521d630>

In [None]:
# Predict story points for the padded test sequences with the ReLU network.
y_test_pred_relu_adam = model_relu_adam.predict(X_test_padded)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# Round each prediction to the nearest integer and cast to int.
y_test_pred_rounded = np.rint(y_test_pred_relu_adam).astype(int)

In [None]:
# Assemble the submission table; the predictions are flattened to 1-D so they
# fit into a single DataFrame column.
results = pd.DataFrame({
    'id': test_set['id'],
    'storypoint': y_test_pred_rounded.flatten()
})

# Persist the predictions without the DataFrame index.
results.to_csv('results-RNA-relu-adam.csv', index=False, header=True)

# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, so it never rendered before.
results.head(15)



---



In [None]:
# Keras building blocks used by this model. They are imported here because no
# earlier cell brings them into scope, so on a fresh kernel this cell failed
# with NameError.
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential

# Feed-forward regressor on top of a frozen embedding layer (sigmoid hidden layers, Adam optimizer).
model_sigmoid_adam = Sequential()
# The embedding weights come from `embedding_matrix` and are not updated
# during training (trainable=False).
model_sigmoid_adam.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=100,
                    weights=[embedding_matrix],
                    input_length=100,
                    trainable=False))

# Flatten the per-token embeddings into one vector per sample for the dense layers.
model_sigmoid_adam.add(Flatten())
model_sigmoid_adam.add(Dense(64, activation='sigmoid'))
model_sigmoid_adam.add(Dense(32, activation='sigmoid'))
# Single output unit with no activation: the predicted story points.
model_sigmoid_adam.add(Dense(1))


model_sigmoid_adam.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])


# 20% of the training rows are held out for validation (validation_split=0.2).
model_sigmoid_adam.fit(X_train_padded, Y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10




[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - loss: 13.9524 - mse: 13.9524 - val_loss: 8.6057 - val_mse: 8.6057
Epoch 2/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 9.2403 - mse: 9.2403 - val_loss: 8.1720 - val_mse: 8.1720
Epoch 3/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 7.8226 - mse: 7.8226 - val_loss: 7.7844 - val_mse: 7.7844
Epoch 4/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 6.9134 - mse: 6.9134 - val_loss: 7.9276 - val_mse: 7.9276
Epoch 5/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 5.1866 - mse: 5.1866 - val_loss: 8.6249 - val_mse: 8.6249
Epoch 6/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 3.7400 - mse: 3.7400 - val_loss: 8.7850 - val_mse: 8.7850
Epoch 7/10
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - lo

<keras.src.callbacks.history.History at 0x7893f53b9d50>

In [None]:
# Predict story points for the padded test sequences with the sigmoid network.
y_test_pred_sigmoid_adam = model_sigmoid_adam.predict(X_test_padded)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# Round each prediction to the nearest integer and cast to int.
y_test_pred_rounded = np.rint(y_test_pred_sigmoid_adam).astype(int)

In [None]:
# Assemble the submission table; the predictions are flattened to 1-D so they
# fit into a single DataFrame column.
results = pd.DataFrame({
    'id': test_set['id'],
    'storypoint': y_test_pred_rounded.flatten()
})

# Persist the predictions without the DataFrame index.
results.to_csv('results-RNA-sigmoid-adam.csv', index=False, header=True)

# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, so it never rendered before.
results.head(15)



---



In [None]:
# 5-fold cross-validated RMSE of the TF-IDF model. The scorer returns negated
# errors, so the sign of the fold average is flipped.
# NOTE(review): the TF-IDF features were fitted on the full training set before
# the folds are split, so validation folds are not fully unseen.
rmse_tfidf = -np.mean(
    cross_val_score(
        estimator=model_tfidf,
        X=X_train_combined_tfidf,
        y=Y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
)
print("RMSE para el modelo con TF-IDF:", rmse_tfidf)

RMSE para el modelo con TF-IDF: 2.7060457571498766


In [None]:
# 5-fold cross-validated MSE of the TF-IDF model. The scorer returns negated
# errors, so the sign of the fold average is flipped.
mse_tfidf = -np.mean(
    cross_val_score(
        estimator=model_tfidf,
        X=X_train_combined_tfidf,
        y=Y_train,
        cv=5,
        scoring="neg_mean_squared_error",
    )
)
print("MSE para el modelo con TF-IDF:", mse_tfidf)

MSE para el modelo con TF-IDF: 7.34334553959246


In [None]:
# 5-fold cross-validated RMSE of the bag-of-words model. The scorer returns
# negated errors, so the sign of the fold average is flipped.
# NOTE(review): the bag-of-words vocabulary was fitted on the full training set
# before the folds are split, so validation folds are not fully unseen.
rmse_bow = -np.mean(
    cross_val_score(
        estimator=model_bow,
        X=X_train_combined_bow,
        y=Y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
)
print("RMSE para el modelo con Bag of Words:", rmse_bow)

RMSE para el modelo con Bag of Words: 2.712511833654131


In [None]:
# 5-fold cross-validated MSE of the bag-of-words model. The scorer returns
# negated errors, so the sign of the fold average is flipped.
mse_bow = -np.mean(
    cross_val_score(
        estimator=model_bow,
        X=X_train_combined_bow,
        y=Y_train,
        cv=5,
        scoring="neg_mean_squared_error",
    )
)
print("MSE para el modelo con Bag of Words:", mse_bow)

MSE para el modelo con Bag of Words: 7.374407903950749
