In [1]:
!pip install faiss-cpu
!pip install sentence_transformers



In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import os
import torch

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU not available. Using CPU.")

GPU is available. Using: Tesla T4


In [4]:
big_data = pd.read_csv('/kaggle/input/dataset0/final_data.csv')

In [5]:
data = big_data[big_data['make'].str.lower().isin(['ford', 'toyota'])].reset_index(drop=True)
print(f"Filtered data contains {len(data)} rows.")

Filtered data contains 17135 rows.


In [6]:
data

Unnamed: 0,make,model,year,defect summary,consequence summary,corrective summary
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...
1,FORD,FOCUS,2001,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...
2,FORD,CROWN VICTORIA,2002,"ON CERTAIN NATURAL GAS MODEL VEHICLES, A T-FIT...",A GAS LEAK OF SUFFICIENT QUANTITY CONCENTRATED...,"DEALERS WILL INSTALL A REDESIGNED ""T"" FITTING ..."
3,FORD,NAVIGATOR,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,CUSTOMERS MAY INFLATE THEIR REAR TIRES BASED O...,A SUPPLEMENTARY LABEL WILL BE SENT TO CUSTOMER...
4,FORD,EXPEDITION,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,CUSTOMERS MAY INFLATE THEIR REAR TIRES BASED O...,A SUPPLEMENTARY LABEL WILL BE SENT TO CUSTOMER...
...,...,...,...,...,...,...
17130,TOYOTA,RAV4,2024,Toyota Motor Engineering & Manufacturing (Toyo...,A damaged brake line can cause a brake fluid l...,Dealers will inspect and tighten the front bra...
17131,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...
17132,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...
17133,FORD,TRANSIT,2024,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...


In [7]:
data['combined_summary'] = (
    data['defect summary'] + " " +
    data['consequence summary'] + " " +
    data['corrective summary']
)

In [8]:
data

Unnamed: 0,make,model,year,defect summary,consequence summary,corrective summary,combined_summary
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
1,FORD,FOCUS,2001,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
2,FORD,CROWN VICTORIA,2002,"ON CERTAIN NATURAL GAS MODEL VEHICLES, A T-FIT...",A GAS LEAK OF SUFFICIENT QUANTITY CONCENTRATED...,"DEALERS WILL INSTALL A REDESIGNED ""T"" FITTING ...","ON CERTAIN NATURAL GAS MODEL VEHICLES, A T-FIT..."
3,FORD,NAVIGATOR,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,CUSTOMERS MAY INFLATE THEIR REAR TIRES BASED O...,A SUPPLEMENTARY LABEL WILL BE SENT TO CUSTOMER...,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...
4,FORD,EXPEDITION,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,CUSTOMERS MAY INFLATE THEIR REAR TIRES BASED O...,A SUPPLEMENTARY LABEL WILL BE SENT TO CUSTOMER...,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...
...,...,...,...,...,...,...,...
17130,TOYOTA,RAV4,2024,Toyota Motor Engineering & Manufacturing (Toyo...,A damaged brake line can cause a brake fluid l...,Dealers will inspect and tighten the front bra...,Toyota Motor Engineering & Manufacturing (Toyo...
17131,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...,Ford Motor Company (Ford) is recalling certain...
17132,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...,Ford Motor Company (Ford) is recalling certain...
17133,FORD,TRANSIT,2024,Ford Motor Company (Ford) is recalling certain...,Rear wheel lock-up or separation of the wheel ...,Dealers will inspect the rear axle shaft bolts...,Ford Motor Company (Ford) is recalling certain...


In [9]:
print("Dataset with Merged Summaries:")
data[['make', 'model', 'year', 'combined_summary']]

Dataset with Merged Summaries:


Unnamed: 0,make,model,year,combined_summary
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
1,FORD,FOCUS,2001,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
2,FORD,CROWN VICTORIA,2002,"ON CERTAIN NATURAL GAS MODEL VEHICLES, A T-FIT..."
3,FORD,NAVIGATOR,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...
4,FORD,EXPEDITION,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...
...,...,...,...,...
17130,TOYOTA,RAV4,2024,Toyota Motor Engineering & Manufacturing (Toyo...
17131,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...
17132,FORD,TRANSIT,2023,Ford Motor Company (Ford) is recalling certain...
17133,FORD,TRANSIT,2024,Ford Motor Company (Ford) is recalling certain...


In [10]:
input_example = {
    'make': 'ford',
    'model': 'focus',
    'year': '2000',
    'issue': 'stuck throttle risk'
}

In [11]:
filtered_data = data[
    (data['make'].str.upper() == input_example['make'].upper()) &
    (data['model'].str.upper() == input_example['model'].upper()) &
    (data['year'] == int(input_example['year']))
]

In [12]:
filtered_data

Unnamed: 0,make,model,year,defect summary,consequence summary,corrective summary,combined_summary
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
12,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES HAVE LOOSE PINCH BO...,"LOOSE PINCH BOLTS COULD EXHIBIT NOISE, VIBRATI...",DEALERS WILL INSPECT THE STEERING KNUCKLE PINC...,CERTAIN PASSENGER VEHICLES HAVE LOOSE PINCH BO...
62,FORD,FOCUS,2000,"VEHICLE DESCRIPTION: PASSENGER CARS, SPORT UT...",LOSS OF VISIBILITY WHILE DRIVING INCREASES THE...,DEALERS WILL INSPECT THE DATE CODE ON THE WIPE...,"VEHICLE DESCRIPTION: PASSENGER CARS, SPORT UT..."
179,FORD,FOCUS,2000,THIS IS NOT A SAFETY RECALL IN ACCORDANCE WITH...,THE WHEEL BEARINGS COULD PREMATURELY WEAR AND ...,DEALERS WILL REPLACE THE REAR WHEEL BEARINGS A...,THIS IS NOT A SAFETY RECALL IN ACCORDANCE WITH...
225,FORD,FOCUS,2000,VEHICLE DESCRIPTION: STATION WAGONS AND 3-DOO...,IF THE SEAT IS THEN RETURNED TO THE UPRIGHT PO...,DEALERS WILL INSTALL A REVISED SEAT BACK HINGE...,VEHICLE DESCRIPTION: STATION WAGONS AND 3-DOO...
227,FORD,FOCUS,2000,VEHICLE DESCRIPTION: PASSENGER VEHICLES. A ...,THIS COULD RESULT IN EITHER A LOSS OF INDIVIDU...,DEALERS WILL REPLACE THE DECKLID WIRE HARNESS.,VEHICLE DESCRIPTION: PASSENGER VEHICLES. A ...
393,FORD,FOCUS,2000,VEHICLE DESCRIPTION: PASSENGER VEHICLES. CER...,"IN THE EVENT OF A CRASH, AN OCCUPANT CONTACTIN...",DEALERS WILL REMOVE THE LEFT AND RIGHT SIDE B-...,VEHICLE DESCRIPTION: PASSENGER VEHICLES. CER...
394,FORD,FOCUS,2000,VEHICLE DESCRIPTION: PASSENGER VEHICLES EQUIP...,A THROTTLE THAT DOES NOT RETURN TO IDLE COULD ...,"DEALERS WILL INSPECT THESE VEHICLES AND, IF NE...",VEHICLE DESCRIPTION: PASSENGER VEHICLES EQUIP...
395,FORD,FOCUS,2000,VEHICLE DESCRIPTION: PASSENGER VEHICLES. THE ...,THIS CONDITION COULD RESULT IN A VEHICLE CRASH.,DEALERS WILL INSPECT THE REAR WHEEL BEARINGS A...,VEHICLE DESCRIPTION: PASSENGER VEHICLES. THE ...
399,FORD,FOCUS,2000,VEHICLE DESCRIPTION: PASSENGER VEHICLES EQUIP...,A THROTTLE THAT DOES NOT RETURN TO IDLE COULD ...,DEALERS WILL REPLACE THE SPEED CONTROL CABLE (...,VEHICLE DESCRIPTION: PASSENGER VEHICLES EQUIP...


In [13]:
if filtered_data.empty:
    print("\nNo matching documents found for the input criteria.")
else:
    print("\nFiltered Data Based on Input Criteria:")
    print(filtered_data[['make', 'model', 'year', 'combined_summary']])


Filtered Data Based on Input Criteria:
      make  model  year                                   combined_summary
0     FORD  FOCUS  2000  CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...
12    FORD  FOCUS  2000  CERTAIN PASSENGER VEHICLES HAVE LOOSE PINCH BO...
62    FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER CARS, SPORT UT...
179   FORD  FOCUS  2000  THIS IS NOT A SAFETY RECALL IN ACCORDANCE WITH...
225   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  STATION WAGONS AND 3-DOO...
227   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER VEHICLES.   A ...
393   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER VEHICLES.  CER...
394   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIP...
395   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER VEHICLES. THE ...
399   FORD  FOCUS  2000  VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIP...
2351  FORD  FOCUS  2000  ON CERTAIN PASSENGER VEHICLES, CONTAMINATION O...
3201  FORD  FOCUS  2000  CERTAIN CK MOTORSPORTS COMBINATION 

In [14]:
filtered_data.shape

(18, 7)

In [15]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)



In [16]:
if not filtered_data.empty:
    document_embeddings = embedding_model.encode(filtered_data['combined_summary'].tolist(), device=device)
    print(document_embeddings)
else:
    document_embeddings = None

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[-0.09328569  0.0171909   0.04776169 ... -0.00439931  0.04448591
  -0.09088497]
 [-0.01966354 -0.0754262   0.03578025 ...  0.05053136 -0.05591796
   0.01465171]
 [-0.07828142 -0.02445234  0.14816457 ...  0.00134399 -0.01780413
   0.03967603]
 ...
 [-0.03099543 -0.04579483  0.01685693 ... -0.0437853  -0.02845422
  -0.00657945]
 [-0.07911659  0.00843505  0.06472374 ... -0.04359849  0.01419516
   0.02107595]
 [-0.06587941 -0.07354918  0.06031438 ... -0.00083293  0.04257085
  -0.0372031 ]]


In [17]:
document_embeddings.shape

(18, 384)

In [18]:
if document_embeddings is not None:
    embedding_dim = document_embeddings.shape[1]
    print(f"\nEmbedding Dimension: {embedding_dim}")
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(np.array(document_embeddings))
    print(index.ntotal)


Embedding Dimension: 384
18


In [19]:
issue_embedding = embedding_model.encode(input_example['issue'], device=device)
print(issue_embedding)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[-1.19805736e-02  2.96361577e-02 -1.29987756e-02  2.26521697e-02
  3.42247412e-02 -1.86177101e-02  4.46575694e-02  1.92646135e-03
 -4.54316065e-02  3.60787362e-02  5.54559603e-02  3.12152524e-02
  8.75129458e-03 -5.43059148e-02 -7.43814260e-02  5.98456822e-02
  7.00556263e-02 -1.73693746e-02 -6.04536645e-02  4.97462377e-02
 -9.59969684e-03  3.31528448e-02 -2.48244908e-02  7.71136507e-02
 -4.68126200e-02  5.65521121e-02 -6.45181313e-02  7.82444924e-02
  3.87308584e-03 -1.07300036e-01 -4.46805954e-02 -1.31476391e-02
 -3.14624305e-03 -7.80154839e-02 -1.35956155e-02 -3.00145335e-03
 -4.78819683e-02 -2.40856670e-02  4.78049181e-02 -6.05333177e-03
  3.91277708e-02 -4.48954217e-02 -2.56041866e-02 -1.98283102e-02
  2.83316225e-02  3.45786009e-03  5.71363010e-02  3.67052481e-02
  5.58182523e-02 -6.95988685e-02 -4.89601083e-02 -3.85777391e-02
  6.83136806e-02 -7.43729845e-02  4.55293991e-02 -2.74302736e-02
  2.86592580e-02  4.54128124e-02  1.53832762e-02 -1.32169407e-02
 -1.10709574e-03 -2.53830

In [20]:
if document_embeddings is not None:
    k = 3
    distances, indices = index.search(np.array([issue_embedding]), k)
    print(distances, indices)
    top_k_docs = filtered_data.iloc[indices[0]]
    print("\nTop K Relevant Documents Retrieved:")
    print(top_k_docs['combined_summary'])
else:
    top_k_docs = None

[[1.2265592 1.2773381 1.300019 ]] [[ 9  7 10]]

Top K Relevant Documents Retrieved:
399     VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIP...
394     VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIP...
2351    ON CERTAIN PASSENGER VEHICLES, CONTAMINATION O...
Name: combined_summary, dtype: object


In [21]:
!pip install  -U bitsandbytes

  pid, fd = os.forkpty()




In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-3.1-8B"
hf_token = "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" 

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token, load_in_4bit=True,
    device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
max_input_length = model.config.max_position_embeddings
max_input_length

131072

In [24]:
combined_text = " ".join(top_k_docs['combined_summary'].tolist())
combined_text


'VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIPPED WITH SPEED CONTROL.  THE SPEED CONTROL CABLE END FITTING CAN ALLOW WATER TO ENTER THE SPEED CONTROL SERVO ASSEMBLY.  IF THIS OCCURS, CORROSION IN THE SERVO ASSEMBLY COULD DEVELOP AND CAUSE INTERMITTENT SPEED CONTROL OPERATION OR PREVENT THE THROTTLE FROM RETURNING TO IDLE. A THROTTLE THAT DOES NOT RETURN TO IDLE COULD RESULT IN UNEXPECTED ACCELERATION, INCREASING THE RISK OF A CRASH. DEALERS WILL REPLACE THE SPEED CONTROL CABLE (WITH A NEW CABLE THAT INCORPORATES A GLASS-FILLED END CAP) AND INSPECT THE ASSEMBLY FOR INTERNAL CORROSION.  SERVOS WITH EVIDENCE OF CORROSION WILL ALSO BE REPLACED.  OWNERS ARE ADVISED NOT TO USE THE SPEED CONTROL UNTIL THE VEHICLES ARE REPAIRED. VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIPPED WITH SPEED CONTROL AND AUTOMATIC TRANSMISSION.  A SPEED CONTROL CABLE COULD HAVE A CORE WIRE THAT IS LONG ENOUGH TO CATCH ON THE SLEEVE AT THE THROTTLE BODY END OF THE CABLE DURING WIDE-OPEN THROTTLE ACCELERATION. A TH

In [25]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [38]:
prompt = f"Summarize the following text:\n{combined_text}\n\nSummary:"
# prompt = f"""
# You are a technical summarizer for vehicle recall data. The following text outlines a recall for some vehicles. 
# Your task is to summarize the vehicle's defect, the consequences, and the corrective measures taken:
# {combined_text}

# Provide a summary that includes the defect, consequences, and solutions, while excluding redundant details.
# """


inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_length, truncation=True, padding=True).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.7, pad_token_id=tokenizer.eos_token_id,
                        top_k = 50, top_p = 0.95, repetition_penalty=1.2)

summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)

Summarize the following text:
VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIPPED WITH SPEED CONTROL.  THE SPEED CONTROL CABLE END FITTING CAN ALLOW WATER TO ENTER THE SPEED CONTROL SERVO ASSEMBLY.  IF THIS OCCURS, CORROSION IN THE SERVO ASSEMBLY COULD DEVELOP AND CAUSE INTERMITTENT SPEED CONTROL OPERATION OR PREVENT THE THROTTLE FROM RETURNING TO IDLE. A THROTTLE THAT DOES NOT RETURN TO IDLE COULD RESULT IN UNEXPECTED ACCELERATION, INCREASING THE RISK OF A CRASH. DEALERS WILL REPLACE THE SPEED CONTROL CABLE (WITH A NEW CABLE THAT INCORPORATES A GLASS-FILLED END CAP) AND INSPECT THE ASSEMBLY FOR INTERNAL CORROSION.  SERVOS WITH EVIDENCE OF CORROSION WILL ALSO BE REPLACED.  OWNERS ARE ADVISED NOT TO USE THE SPEED CONTROL UNTIL THE VEHICLES ARE REPAIRED. VEHICLE DESCRIPTION:  PASSENGER VEHICLES EQUIPPED WITH SPEED CONTROL AND AUTOMATIC TRANSMISSION.  A SPEED CONTROL CABLE COULD HAVE A CORE WIRE THAT IS LONG ENOUGH TO CATCH ON THE SLEEVE AT THE THROTTLE BODY END OF THE CABLE DURING WIDE-OPE