In [1]:
import zipfile

# Check the contents of the ZIP file
with zipfile.ZipFile('C:\\Users\\user\\Downloads\\complaints.zip', 'r') as zip_ref:
    print(zip_ref.namelist())  # This will print the list of files in the zip archive


['complaints/', 'complaints/consumercomplaints.csv', '__MACOSX/complaints/._consumercomplaints.csv']


In [2]:
import zipfile
import pandas as pd

# Corrected path inside the ZIP file
with zipfile.ZipFile('C:\\Users\\user\\Downloads\\complaints.zip', 'r') as zip_ref:
    with zip_ref.open('complaints/consumercomplaints.csv') as csv_file:
        data = pd.read_csv(csv_file)

# Display the first few rows of the DataFrame
print(data.head())


   Unnamed: 0 Date received  \
0           0    2022-11-11   
1           1    2022-11-23   
2           2    2022-11-16   
3           3    2022-11-15   
4           4    2022-11-07   

                                             Product  \
0                                           Mortgage   
1  Credit reporting, credit repair services, or o...   
2                                           Mortgage   
3                        Checking or savings account   
4                                           Mortgage   

                  Sub-product                           Issue  \
0  Conventional home mortgage  Trouble during payment process   
1            Credit reporting     Improper use of your report   
2                 VA mortgage  Trouble during payment process   
3            Checking account             Managing an account   
4      Other type of mortgage  Trouble during payment process   

                                       Sub-issue  \
0                                

In [3]:
data = data.drop("Unnamed: 0",axis=1)

In [4]:
print(data.isnull().sum())

Date received                         0
Product                               0
Sub-product                      235294
Issue                                 0
Sub-issue                        683355
Consumer complaint narrative    1987977
dtype: int64


In [5]:
data = data.dropna()

In [6]:
print(data["Product"].value_counts())

Product
Credit reporting, credit repair services, or other personal consumer reports    507582
Debt collection                                                                 192045
Credit card or prepaid card                                                      80410
Checking or savings account                                                      54192
Student loan                                                                     32697
Vehicle loan or lease                                                            19874
Payday loan, title loan, or personal loan                                         1008
Name: count, dtype: int64


In [7]:
pip install nltk




In [1]:
import zipfile
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Load the CSV file from the ZIP archive
with zipfile.ZipFile('C:\\Users\\user\\Downloads\\complaints.zip', 'r') as zip_ref:
    with zip_ref.open('complaints/consumercomplaints.csv') as csv_file:
        data = pd.read_csv(csv_file)

# Initialize the stemmer and stopwords
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

# Define the clean function
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub('<.*?>+', '', text)  # Remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub('\n', '', text)  # Remove new lines
    text = re.sub('\w*\d\w*', '', text)  # Remove words containing numbers
    text = [word for word in text.split() if word not in stopword]  # Remove stopwords
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split()]  # Apply stemming
    text = " ".join(text)
    return text

# Apply the clean function to the "Consumer complaint narrative" column
if "Consumer complaint narrative" in data.columns:
    data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)

# Display the first few rows
print(data.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0 Date received  \
0           0    2022-11-11   
1           1    2022-11-23   
2           2    2022-11-16   
3           3    2022-11-15   
4           4    2022-11-07   

                                             Product  \
0                                           Mortgage   
1  Credit reporting, credit repair services, or o...   
2                                           Mortgage   
3                        Checking or savings account   
4                                           Mortgage   

                  Sub-product                           Issue  \
0  Conventional home mortgage  Trouble during payment process   
1            Credit reporting     Improper use of your report   
2                 VA mortgage  Trouble during payment process   
3            Checking account             Managing an account   
4      Other type of mortgage  Trouble during payment process   

                                       Sub-issue  \
0                                

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Selecting the relevant columns
data = data[["Consumer complaint narrative", "Product"]]

# Converting the data to numpy arrays
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

# Vectorizing the text data
cv = CountVectorizer()
X = cv.fit_transform(x)

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

# Confirm shapes of the training and test sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2078319, 228534) (1023650, 228534) (2078319,) (1023650,)


In [5]:
from sklearn.linear_model import SGDClassifier

# Initialize the model
sgdmodel = SGDClassifier()

# Train the model
sgdmodel.fit(X_train, y_train)

In [None]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)

In [None]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)