# Data Preprocessing

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

/kaggle/input/legal-citation-text-classification/legal_text_classification.csv


In [3]:
df = pd.read_csv('/kaggle/input/legal-citation-text-classification/legal_text_classification.csv')
df.shape

df1 = df[['case_outcome', 'case_text']].copy()
df1 = df1[pd.notnull(df1['case_text'])]
df1.columns = ['Outcome', 'Text'] 
print("Processed DataFrame Shape:", df1.shape)

Processed DataFrame Shape: (24809, 2)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Assuming df1 is the correct DataFrame
# Split the data into train, valid, and test sets
train_data, valid_test_data = train_test_split(df1, test_size=0.3)
valid_data, test_data = train_test_split(valid_test_data, test_size=0.66)

# tf-idf
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10,
                        ngram_range=(2, 2), 
                        max_features = 1000,
                        stop_words='english')

label_encoder = LabelEncoder()
train_features = tfidf.fit_transform(train_data.Text).toarray()
train_labels = label_encoder.fit_transform(train_data['Outcome'])

test_features = tfidf.fit_transform(test_data.Text).toarray()
test_labels = label_encoder.fit_transform(test_data['Outcome'])


# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['Text'])

X_train = tokenizer.texts_to_sequences(train_data['Text'])
X_valid = tokenizer.texts_to_sequences(valid_data['Text'])
X_test = tokenizer.texts_to_sequences(test_data['Text'])

# Pad sequences to ensure consistent length
max_length = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_valid = pad_sequences(X_valid, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')


y_train = label_encoder.fit_transform(train_data['Outcome'])
y_valid = label_encoder.fit_transform(valid_data['Outcome'])
y_test = label_encoder.transform(test_data['Outcome'])

dic = {}
for a,b in zip(test_labels, label_encoder.inverse_transform(test_labels)):
    dic[a] = b
target_names_all = [item[1] for item in sorted(dic.items())]

# RandomForestClassifier

In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10, random_state=1)
model.fit(train_features, train_labels)
test_pred = model.predict(test_features)
print("Classification Report:")
print(metrics.classification_report(test_labels, test_pred, target_names= target_names_all))

Classification Report:
               precision    recall  f1-score   support

     affirmed       0.00      0.00      0.00        22
      applied       0.14      0.04      0.06       499
     approved       0.00      0.00      0.00        22
        cited       0.48      0.82      0.60      2363
   considered       0.04      0.01      0.02       318
    discussed       0.08      0.01      0.01       188
distinguished       0.00      0.00      0.00       116
     followed       0.08      0.02      0.03       465
  referred to       0.19      0.11      0.14       898
      related       0.00      0.00      0.00        22

     accuracy                           0.42      4913
    macro avg       0.10      0.10      0.09      4913
 weighted avg       0.29      0.42      0.33      4913



  _warn_prf(average, modifier, msg_start, len(result))


# LinearSVC

In [7]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(train_features, train_labels)
test_pred = model.predict(test_features)
print("Classification Report:")
print(metrics.classification_report(test_labels, test_pred, target_names= target_names_all))

Classification Report:
               precision    recall  f1-score   support

     affirmed       0.00      0.00      0.00        22
      applied       0.11      0.03      0.05       499
     approved       0.00      0.00      0.00        22
        cited       0.48      0.78      0.60      2363
   considered       0.07      0.03      0.04       318
    discussed       0.05      0.01      0.02       188
distinguished       0.07      0.01      0.02       116
     followed       0.12      0.05      0.07       465
  referred to       0.21      0.13      0.16       898
      related       0.00      0.00      0.00        22

     accuracy                           0.41      4913
    macro avg       0.11      0.10      0.10      4913
 weighted avg       0.30      0.41      0.33      4913



  _warn_prf(average, modifier, msg_start, len(result))


# MLPClassifier

In [9]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(300,), max_iter=10, random_state=1)
model.fit(train_features, train_labels)
test_pred = model.predict(test_features)
print("Classification Report:")
print(metrics.classification_report(test_labels, test_pred, target_names= target_names_all))

Classification Report:
               precision    recall  f1-score   support

     affirmed       0.00      0.00      0.00        22
      applied       0.11      0.05      0.07       499
     approved       0.00      0.00      0.00        22
        cited       0.49      0.79      0.60      2363
   considered       0.07      0.03      0.04       318
    discussed       0.03      0.01      0.02       188
distinguished       0.20      0.01      0.02       116
     followed       0.12      0.04      0.06       465
  referred to       0.23      0.14      0.17       898
      related       0.00      0.00      0.00        22

     accuracy                           0.41      4913
    macro avg       0.13      0.11      0.10      4913
 weighted avg       0.31      0.41      0.34      4913



  _warn_prf(average, modifier, msg_start, len(result))


# Conv1D

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define the CONV1D model
model_conv1d = Sequential()
model_conv1d.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model_conv1d.add(Conv1D(128, 5, activation='relu'))
model_conv1d.add(GlobalMaxPooling1D())
model_conv1d.add(Dense(10, activation='softmax'))

# Compile the CONV1D model
model_conv1d.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the CONV1D model with GPU
model_conv1d.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_valid, y_valid))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a5f660ca510>

In [11]:
from sklearn import metrics
y_pred = np.argmax(np.round(model_conv1d.predict(X_test)),axis = 1)
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=target_names_all))


Classification Report:
               precision    recall  f1-score   support

     affirmed       0.01      0.50      0.03        22
      applied       0.40      0.25      0.31       499
     approved       0.25      0.05      0.08        22
        cited       0.69      0.74      0.72      2363
   considered       0.51      0.26      0.35       318
    discussed       0.45      0.27      0.33       188
distinguished       0.70      0.34      0.45       116
     followed       0.63      0.34      0.44       465
  referred to       0.70      0.47      0.56       898
      related       0.89      0.36      0.52        22

     accuracy                           0.54      4913
    macro avg       0.52      0.36      0.38      4913
 weighted avg       0.63      0.54      0.57      4913

