In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import nltk
import re
from nltk.corpus import stopwords
import string

In [None]:
# Read the consumer-complaints CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="consumercomplaints.csv")
data.head(n=5)

In [None]:
# The dataset contains an "Unnamed: 0" column that is not needed; remove it.
# errors="ignore" keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Count the missing values in each column of the dataset.
print(data.isna().sum())

In [None]:
# Many values are missing, so keep only the rows that have a value in every column.
data = data.dropna(axis="index", how="any")

In [None]:
# "Product" is the label column: each value names the nature of the complaint
# the consumer reported. Print every label with its number of occurrences,
# most frequent first.
print(data["Product"].value_counts(sort=True, ascending=False))

The "Consumer complaint narrative" column contains the complete description of each complaint reported by the consumers. I will clean and prepare this column before using it in a machine learning model.

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download("stopwords")

# English stop words, held in a set, and an English Snowball stemmer.
stopword = set(stopwords.words("english"))
stemmer = nltk.SnowballStemmer("english")


def clean(text):
    """Normalise one complaint narrative for bag-of-words vectorisation.

    Steps, in order: lower-case the text; delete ``[...]`` spans, URLs,
    ``<...>`` tags and punctuation; turn newlines into spaces; delete every
    token that contains a digit; discard stop words; stem what is left.

    Relies on two module-level names: ``stemmer`` (an object with a
    ``stem(word)`` method) and ``stopword`` (the collection of words to drop).

    Args:
        text: The raw narrative. Any object is accepted; it is converted
            with ``str()`` before processing.

    Returns:
        str: The surviving, stemmed tokens joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) away from Python's
    # own string-escape processing, which warns about them in plain literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words, so it becomes a space; deleting it outright
    # would fuse the last word of one line with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument skips runs of whitespace, so the removals above
    # never hand an empty token to the stemmer. Filter and stem in one pass.
    tokens = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(tokens)

# Run every narrative through clean() and store the result back in the same column.
data["Consumer complaint narrative"] = (
    data["Consumer complaint narrative"].apply(clean)
)


In [None]:
# Keep only the narrative text (independent variable) and the product label
# (dependent variable), then expose each column as a NumPy array.
data = data.loc[:, ["Consumer complaint narrative", "Product"]]

x, y = (
    np.array(data[column])
    for column in ("Consumer complaint narrative", "Product")
)

In [None]:
# Turn the cleaned narratives into a bag-of-words count matrix, then split
# the matrix and the labels into train and test sets.
cv = CountVectorizer()
# The matrix gets its own name, X, instead of being assigned over `x`:
# the split below needs `X` to exist, the training cell reads `X_train`,
# and leaving `x` as the raw text keeps this cell safe to re-run.
X = cv.fit_transform(x)
# Hold out 33 % of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

In [None]:
# Train the classifier with the Stochastic Gradient Descent algorithm.
# SGD shuffles the training data, so it is seeded (same seed as the
# train/test split) to give the same fitted model on every run.
sgdmodel = SGDClassifier(random_state=0)
sgdmodel.fit(X_train, y_train)

In [None]:
# Use the trained model to classify a complaint typed in by the user.
user = input("Enter a Text: ")
# Apply the same clean() preprocessing the training narratives received, so
# the input is lower-cased, stripped and stemmed into the same vocabulary.
# The vectoriser's sparse output is passed to predict() as-is (no dense copy),
# and it is kept under its own name so the `data` DataFrame is not overwritten.
user_features = cv.transform([clean(user)])
output = sgdmodel.predict(user_features)
print(output)