In [15]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


In [1]:
# Read the corporate-jargon dataset and print its first five rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer='corporate_lingo.csv')
print(df.head(n=5))

               Term                                         Definition  \
0               2.0                 the improved version of something.   
1  30,000-feet view        looking at something in the broadest sense.   
2             80/20  finding the way that will lead to the most pro...   
3       Action item                    something that has to get done.   
4        Actionable                                                NaN   

                                             Example  
0           "Let's call this the Strategic Plan 2.0"  
1       "At the 30,000-feet view, the problem is..."  
2  "We don't have time to do a full analysis. Let...  
3    "Did any action items come out of the meeting?"  
4  "Your email isn't actionable enough. What do y...  


In [4]:
# Build one free-text column per row: definition, a single space, then the
# example. Missing values in either column are treated as empty strings.
df['Text'] = (
    df['Definition'].fillna('')
    .add(' ')
    .add(df['Example'].fillna(''))
)

In [8]:

# Keep only the rows whose combined text is something other than whitespace
df = df.loc[df['Text'].str.strip().ne('')]

# Features are the combined text; labels are the jargon terms
X, y = df['Text'], df['Term']

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the recorded results further down show accuracy 0.0000 with a
# per-class support of at most 1 -- presumably every Term occurs only once, so
# held-out terms are never seen during training. Confirm whether a random
# hold-out is a meaningful evaluation for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Initialize and fit the TfidfVectorizer
# English stop words are dropped. The vectorizer is fitted on the training
# text only; the test text is then transformed with that same fitted
# vectorizer, so both matrices share one vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [38]:
# Candidate classifiers keyed by display name; a fixed seed is passed to the
# estimators that are given a random_state here.
models = dict([
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Linear SVM', LinearSVC(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
])

# The fitted vectorizer is stored alongside the classifiers so it can be
# pickled from the same dict.
# NOTE(review): this entry is not a classifier -- any code that iterates over
# `models` to fit/predict has to skip it.
models.update({'TfidfVectorizer': vectorizer})

In [16]:
# Fit every classifier on the TF-IDF training matrix and report accuracy,
# confusion matrix and per-class metrics on the held-out rows.
for name, model in models.items():
    # `models` also holds the fitted TfidfVectorizer (stored there so it can be
    # pickled). It has no `predict` and must not be re-fitted on the already
    # vectorized matrix, so anything that cannot predict is skipped.
    if not hasattr(model, 'predict'):
        continue

    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'\n{name}:')
    print(f"Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    # zero_division=0 reports 0.00 for classes with no predictions or no
    # support (the same values as before) without emitting a warning per metric.
    print(classification_report(y_test, y_pred, zero_division=0))


Multinomial Naive Bayes:
Accuracy: 0.0000
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
                                       precision    recall  f1-score   support

                                  2.0       0.00      0.00      0.00       0.0
                                80/20       0.00      0.00      0.00       0.0
                                   AP       0.00      0.00      0.00       1.0
                            Add value       0.00      0.00      0.00       1.0
                               Agenda       0.00      0.00      0.00       0.0
                                  B2B       0.00      0.00      0.00       0.0
                                  B2C       0.00      0.00      0.00       1.0
               Backburner/Frontburner       0.00      0.00      0.00       1.0
                            Bandwidth       0.00      0.00      0.00       1.0
                           Bellwether       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest:
Accuracy: 0.0000
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
                                       precision    recall  f1-score   support

                                80/20       0.00      0.00      0.00       0.0
                                   AP       0.00      0.00      0.00       1.0
                            Add value       0.00      0.00      0.00       1.0
                              Adjourn       0.00      0.00      0.00       0.0
                                  B2B       0.00      0.00      0.00       0.0
                                  B2C       0.00      0.00      0.00       1.0
               Backburner/Frontburner       0.00      0.00      0.00       1.0
                            Bandwidth       0.00      0.00      0.00       1.0
                           Bellwether       0.00      0.00      0.00       1.0
                Blocking and tackling       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Logistic Regression:
Accuracy: 0.0000
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
                                       precision    recall  f1-score   support

                                80/20       0.00      0.00      0.00       0.0
                                   AP       0.00      0.00      0.00       1.0
                            Add value       0.00      0.00      0.00       1.0
                               Agenda       0.00      0.00      0.00       0.0
                                  B2B       0.00      0.00      0.00       0.0
                                  B2C       0.00      0.00      0.00       1.0
               Backburner/Frontburner       0.00      0.00      0.00       1.0
                            Bandwidth       0.00      0.00      0.00       1.0
                           Bellwether       0.00      0.00      0.00       1.0
                             Blessing       0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Single-text prediction helper
def predict_term(text, model, vectorizer):
    """Return the label `model` predicts for one piece of text.

    The text is wrapped in a one-element list, passed through
    ``vectorizer.transform``, and the first entry of ``model.predict`` on the
    transformed result is returned.
    """
    features = vectorizer.transform([text])
    predictions = model.predict(features)
    return predictions[0]

In [19]:

# Spot-check the Logistic Regression classifier on one sample sentence
# (swap the dictionary key to try a different trained model)
example_text = "The ball is in the legal department's court. We need their approval before we can proceed."
best_model = models['Logistic Regression']
predicted_term = predict_term(text=example_text, model=best_model, vectorizer=vectorizer)
print(f"\nPredicted term for '{example_text}': {predicted_term}")


Predicted term for 'The ball is in the legal department's court. We need their approval before we can proceed.': Ball in \[someone's\] court


In [20]:

# Spot-check the Random Forest classifier on one sample sentence
# (swap the dictionary key to try a different trained model)
example_text = "The ball is in the legal department's court. We need their approval before we can proceed."
best_model = models['Random Forest']
predicted_term = predict_term(text=example_text, model=best_model, vectorizer=vectorizer)
print(f"\nPredicted term for '{example_text}': {predicted_term}")


Predicted term for 'The ball is in the legal department's court. We need their approval before we can proceed.': Ball in \[someone's\] court


In [21]:

# Spot-check the Linear SVM classifier on a short phrase
# (swap the dictionary key to try a different trained model)
example_text = "as soon as possible"
best_model = models['Linear SVM']
predicted_term = predict_term(text=example_text, model=best_model, vectorizer=vectorizer)
print(f"\nPredicted term for '{example_text}': {predicted_term}")


Predicted term for 'as soon as possible': ASAP


In [22]:

# Spot-check the Multinomial Naive Bayes classifier on one sample sentence
# (swap the dictionary key to try a different trained model).
# In file order this is the last assignment to `best_model`, so it is also the
# model handed to test_model() further down.
best_model = models['Multinomial Naive Bayes']
# An empty string gives the vectorizer nothing to work with, so the prediction
# would be meaningless; use the sentence shown in this cell's recorded output.
example_text = "let me circle back to you on this"
predicted_term = predict_term(example_text, best_model, vectorizer)
print(f"\nPredicted term for '{example_text}': {predicted_term}")


Predicted term for 'let me circle back to you on this': Circle back


In [23]:
def test_model(model, vectorizer, examples):
    """Print each example phrase together with the term predicted for it.

    Every item of `examples` is passed to `predict_term` with the given
    `model` and `vectorizer`; nothing is returned.
    """
    for example in examples:
        prediction = predict_term(text=example, model=model, vectorizer=vectorizer)
        print(f"Input: '{example}'\nPredicted: {prediction}\n")

# List of test examples
# The phrases mix workplace jargon with everyday sentences (weather, groceries,
# movie times). predict_term always returns the model's first prediction, so
# every phrase gets a term printed -- see the recorded output for the everyday
# sentences.
test_examples = [
    "Let's circle back on this issue next week.",
    "The weather is nice today.",
    "We need to move the needle on this project.",
    "I'm going to the grocery store after work.",
    "Can you give me a high-level overview of the situation?",
    "Let's discuss this over lunch.",
    "We should leverage our core competencies.",
    "What time does the movie start?",
    "Let's run the numbers again.",
    "We need to push this live."
]

# `best_model` is whatever an earlier cell assigned last; in file order that is
# the Multinomial Naive Bayes model.
print("Testing the model with example phrases:")
test_model(best_model, vectorizer, test_examples)

Testing the model with example phrases:
Input: 'Let's circle back on this issue next week.'
Predicted: Can of worms

Input: 'The weather is nice today.'
Predicted: On my radar

Input: 'We need to move the needle on this project.'
Predicted: Move the needle / Move the dial

Input: 'I'm going to the grocery store after work.'
Predicted: Lever... to pull

Input: 'Can you give me a high-level overview of the situation?'
Predicted: Take to the next level

Input: 'Let's discuss this over lunch.'
Predicted: Touch base

Input: 'We should leverage our core competencies.'
Predicted: Leverage

Input: 'What time does the movie start?'
Predicted: Resonate

Input: 'Let's run the numbers again.'
Predicted: Deck

Input: 'We need to push this live.'
Predicted: Push back



In [24]:
import pickle

In [28]:
# Save the trained Multinomial Naive Bayes model, then reload it to confirm the
# pickle round-trips.
# The target directory is created first so the save also works on a fresh
# checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)

with open('models/multinomial_nb.pkl','wb') as f:
    pickle.dump(models['Multinomial Naive Bayes'],f)

# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code when given an untrusted file.
with open('models/multinomial_nb.pkl','rb') as f:
    loaded_model = pickle.load(f)

In [29]:
# Bare expression: shows the model object that was just reloaded from disk
loaded_model

In [33]:
# Persist the trained Linear SVM classifier as a binary pickle file
with open('models/linear_svc.pkl', mode='wb') as f:
    pickle.dump(models['Linear SVM'], f)


In [31]:
# Persist the trained Random Forest classifier as a binary pickle file
with open('models/random_forest.pkl', mode='wb') as f:
    pickle.dump(models['Random Forest'], f)


In [32]:
# Persist the trained Logistic Regression classifier as a binary pickle file
with open('models/logistic_regression.pkl', mode='wb') as f:
    pickle.dump(models['Logistic Regression'], f)


In [39]:
# Persist the fitted TF-IDF vectorizer (stored in `models` under its own key)
# so new text can be transformed the same way as the training text
with open('models/tfidf_vectorizer.pkl', mode='wb') as f:
    pickle.dump(models['TfidfVectorizer'], f)
