In [1]:
#Library

import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, precision_score

from hazm import Normalizer, word_tokenize, stopwords_list
import re





Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [60]:
#Import files
train_df = pd.read_csv('data_train.csv')

# pre-processing

Hazm dictionary has been used for pre-processeing

In [62]:
comments = train_df['comment']


In [63]:
def clean_text(text):
    # Remove any special characters and symbols
    text = re.sub(r'[^\w\s]', '', text)
    return text

comments = comments.apply(clean_text)

In [64]:
# Step 2: Tokenization and Normalization
normalizer = Normalizer()
comments = comments.apply(normalizer.normalize)
comments = comments.apply(word_tokenize)

In [65]:
# Step 3: Stopword Removal
stop_words = set(stopwords_list())
comments = comments.apply(lambda x: [word for word in x if word not in stop_words])

In [66]:
train_df['comment'] = comments

In [67]:
# Join the tokenized words into sentences, otherwise models can not work
train_df['comment'] = train_df['comment'].apply(lambda x: ' '.join(x))

As last column is Str, we need to ecnode it. 

In [72]:
# Create columns for each intent class and set initial value to 0
new_df = pd.DataFrame(train_df)


intent_classes = ['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5']
for class_label in intent_classes:
    new_df[class_label] = 0

# Update the columns based on the intent values
for index, row in new_df.iterrows():
    intents = row['intent'].split(',')
    for intent in intents:
        new_df.at[index, f'Class {intent}'] = 1

# Drop the original intent column
new_df.drop(columns=['intent'], inplace=True)

train_df = new_df

# CNN

In [95]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.metrics import Precision



In [76]:
df = train_df
X = df['comment'].values
y = df[['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5']].values


In [81]:
# Step 2: Tokenization
max_words = 10000  # Adjust as needed
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

In [93]:
# Step 3: Padding
maxlen = 100  # Adjust as needed
X_pad = pad_sequences(X_seq, maxlen=maxlen)

In [94]:
# Step 4: Define the CNN architecture
embedding_dim = 100  # Adjust as needed
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(5, activation='sigmoid'))  # Sigmoid activation for multi-label classification


In [96]:
# Step 5: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Precision()])


In [97]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [104]:
# Step 6: Train the model
model.fit(X_train, y_train, epochs=2, batch_size=40, validation_split=0.1)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x137d86fdf90>

In [105]:
# Step 7: Evaluate the model
loss, precision = model.evaluate(X_test, y_test)
print("Test Precision:", precision)

Test Precision: 0.8464247584342957


# predict

In [108]:
test_df = pd.read_csv('data_test_users.csv')

Unnamed: 0,id,comment
0,15336956,خیلی خوبه عالیه
1,15336959,زیبا بود
2,15336960,به علت شکیتگی مرجوع کردم
3,15336961,هم جعبه ماوس باز شده بود و هم ماوس شکسته بود
4,15336964,چراغ قوه اش خوب بود
...,...,...
8995,14426574,من سایز ۱۸ تا ۲۴ ماه سفارش دادم اما وقتی رسید ...
8996,14426637,ولی این بار برای من اشتباه ارسال شده بود
8997,14426701,متاسفاته من مهتابی سفارش داده بودم ولی برام آف...
8998,14426742,دقیقاهمین چیزی که داخل عکسه ر


In [109]:
test_comments = test_df['comment']

In [110]:
test_comments = test_comments.apply(clean_text)
# Step 2: Tokenization and Normalization
test_comments = test_comments.apply(normalizer.normalize)
test_comments = test_comments.apply(word_tokenize)

In [111]:
# Step 3: Stopword Removal
test_comments = test_comments.apply(lambda x: [word for word in x if word not in stop_words])
test_df['comment'] = test_comments

In [113]:
# Join the tokenized words into sentences, otherwise models can not work
test_df['comment'] = test_df['comment'].apply(lambda x: ' '.join(x))

# Create columns for each intent class and set initial value to 0
new_df_test = pd.DataFrame(test_df)


In [116]:
# Step 1: Preprocess the test data
test_data = new_df_test['comment'].values  # assuming df_test is your test dataframe
test_data_seq = tokenizer.texts_to_sequences(test_data)
test_data_pad = pad_sequences(test_data_seq, maxlen=maxlen)

In [117]:
# Step 2: Use the trained model to predict the labels for the test data
predictions_test = model.predict(test_data_pad)



In [124]:
# Step 3: Convert the predicted probabilities into binary classes based on a threshold
threshold = 0.5
binary_predictions_test = (predictions_test > threshold).astype(int)

In [126]:
# Step 4: Display or further analyze the predicted classes
print("Predicted classes for the test data:")
print(binary_predictions_test)

Predicted classes for the test data:
[[1 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 ...
 [0 0 0 1 0]
 [0 0 0 1 0]
 [1 0 0 0 0]]


In [127]:
# In order to avoid mistake, we again make a data frame from test data.
test_df_original = pd.read_csv('data_test_users.csv')

In [129]:
# Convert the binary predictions to DataFrame
predictions_df = pd.DataFrame(binary_predictions_test, columns=['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5'])

# Concatenate the predictions DataFrame with the original test DataFrame
df_test_final = pd.concat([test_df_original, predictions_df], axis=1)

            id                                            comment  Class 1  \
0     15336956                                    خیلی خوبه عالیه        1   
1     15336959                                           زیبا بود        0   
2     15336960                           به علت شکیتگی مرجوع کردم        0   
3     15336961       هم جعبه ماوس باز شده بود و هم ماوس شکسته بود        1   
4     15336964                                چراغ قوه اش خوب بود        0   
...        ...                                                ...      ...   
8995  14426574  من سایز ۱۸ تا ۲۴ ماه سفارش دادم اما وقتی رسید ...        0   
8996  14426637           ولی این بار برای من اشتباه ارسال شده بود        0   
8997  14426701  متاسفاته من مهتابی سفارش داده بودم ولی برام آف...        0   
8998  14426742                      دقیقاهمین چیزی که داخل عکسه ر        0   
8999  14426779  همین الان به دستم رسید . برخلاف کیف های دیگه ک...        1   

      Class 2  Class 3  Class 4  Class 5  
0           0       

In [131]:
intent_list = []
for index, row in df_test_final.iterrows():
    intent = ','.join([str(i+1) for i, val in enumerate(row[['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5']]) if val == 1])
    intent_list.append(intent)

# Add the 'intent' column to the dataframe
df_test_final['intent'] = intent_list

In [135]:
# Drop the individual class columns
df_test_final.drop(['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5'], axis=1, inplace=True)

KeyError: "['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5'] not found in axis"

In [137]:
df_test_final.to_csv("df_test_final.csv", index=False)