In [10]:
# Core
import numpy as np
import pandas as pd

# NLP
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Audio
import librosa

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLP resources
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
#dataset
data = {
    "text": [
        # ---------- TRUTH-LIKE STATEMENTS (0) ----------
        "I was at home watching television the entire evening",
        "I went to the office and completed my regular tasks",
        "I clearly remember submitting the report on time",
        "I did not take anything from the desk",
        "I was with my friends at the cafe",
        "I followed the instructions exactly as given",
        "I arrived at the location around 9 AM",
        "I completed the work before the deadline",
        "I attended the meeting as scheduled",
        "I left the building after finishing my shift",
        "I paid for the item using my card",
        "I informed my supervisor about the issue",
        "I stayed at home due to illness",
        "I locked the door before leaving",
        "I spoke honestly during the interview",
        "I was present during the entire lecture",
        "I submitted the assignment myself",
        "I did not access any restricted files",
        "I followed the safety guidelines",
        "I was not involved in the incident",

        # ---------- DECEPTION-LIKE STATEMENTS (1) ----------
        "I think maybe I took the money but I am not sure",
        "Honestly I don't really remember what happened",
        "To be honest it all happened very fast",
        "I swear I had nothing to do with it",
        "I might have been there for a short time",
        "I don't exactly recall where I was",
        "As far as I remember I didn’t touch anything",
        "I believe I followed the rules mostly",
        "I guess I was present but I can’t say clearly",
        "I think someone else may have done it",
        "I don’t clearly remember signing that document",
        "I may have seen it but I am not certain",
        "I honestly cannot recall the details",
        "I suppose it could have happened that way",
        "I was probably not involved",
        "I think I told the truth as far as I know",
        "I don’t remember clearly who was there",
        "I might have checked the file briefly",
        "I’m not sure if I entered the room",
        "I can’t really say what happened exactly"
    ],

    "label": [
        # 20 truth labels
        0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,

        # 20 deception labels
        1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I was at home watching television the entire e...,0
1,I went to the office and completed my regular ...,0
2,I clearly remember submitting the report on time,0
3,I did not take anything from the desk,0
4,I was with my friends at the cafe,0
5,I followed the instructions exactly as given,0
6,I arrived at the location around 9 AM,0
7,I completed the work before the deadline,0
8,I attended the meeting as scheduled,0
9,I left the building after finishing my shift,0


In [12]:
#text preprocessing

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(clean_text)
df


Unnamed: 0,text,label,clean_text
0,I was at home watching television the entire e...,0,home watching television entire evening
1,I went to the office and completed my regular ...,0,went office completed regular tasks
2,I clearly remember submitting the report on time,0,clearly remember submitting report time
3,I did not take anything from the desk,0,take anything desk
4,I was with my friends at the cafe,0,friends cafe
5,I followed the instructions exactly as given,0,followed instructions exactly given
6,I arrived at the location around 9 AM,0,arrived location around
7,I completed the work before the deadline,0,completed work deadline
8,I attended the meeting as scheduled,0,attended meeting scheduled
9,I left the building after finishing my shift,0,left building finishing shift


In [13]:
#feature extraction
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [14]:
#text model
text_model = LogisticRegression()
text_model.fit(X_train, y_train)

y_pred = text_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.3333333333333333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.36      0.80      0.50         5

    accuracy                           0.33        12
   macro avg       0.18      0.40      0.25        12
weighted avg       0.15      0.33      0.21        12



In [15]:
#audio feature extraction
def extract_audio_features(file_path):
    y, sr = librosa.load(file_path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfcc.T, axis=0)
    return mfcc_mean


In [18]:
#decision level fusion
def fusion_decision(text_score, audio_score, alpha=0.5):
    final_score = alpha * text_score + (1 - alpha) * audio_score
    return "Deception-Likely" if final_score > 0.5 else "Truth-Likely"


In [17]:
#demo predictions
sample_text = "I honestly don't remember what happened"
cleaned = clean_text(sample_text)
vector = vectorizer.transform([cleaned])

text_prob = text_model.predict_proba(vector)[0][1]

print("Text deception probability:", text_prob)
print("Final Decision:", fusion_decision(text_prob, audio_score=0.6))


Text deception probability: 0.6733301952914628
Final Decision: Deception-Likely


## Multimodal AI System for Deception Pattern Analysis

This project presents a research-oriented artificial intelligence system designed to analyze deception-related behavioral patterns using textual data.  
Natural Language Processing techniques are applied to extract linguistic cues such as uncertainty, hesitation, and over-explanation.  
Textual statements are transformed into numerical representations using TF-IDF vectorization.  
A supervised machine learning classifier is trained to distinguish between truth-like and deception-like statements.  
The system outputs probabilistic predictions rather than absolute truth judgments.  
The dataset used is synthetically constructed based on established deception research patterns.  
Model performance is evaluated using accuracy, precision, recall, and confusion matrix metrics.  
This project emphasizes explainability and responsible AI practices.  
The system is intended strictly for educational and research purposes.  
It does not claim to determine factual truth or guilt.
